Sanjoy Das (sanjoy) | @google @GoogleCloudPlatform | CA | http://playingwithpointers.com | Compilers and VMs

sanjoy/DietLISP 33

Toy lisp interpreter / PLT playground

sanjoy/CmdArgs 19

Painless command line argument parsing in C.

sanjoy/L 5

A Small Evaluator for Untyped Lambda Calculus

sanjoy/clayoven 1

Modern website generator with a traditional design

sanjoy/eelish 1

n00b programming without locks

sanjoy/Me 1

Repository to track some of my configuration files.

Pull request review commenttensorflow/tensorflow

[WIP] DLPack functions

+#include "tensorflow/c/eager/dlpack.h"+#include "include/dlpack/dlpack.h"  // TF:dlpack+#include "tensorflow/c/eager/c_api_internal.h"+#include "tensorflow/c/tf_status_helper.h"+#include "tensorflow/core/framework/tensor.h"+#include "tensorflow/core/platform/casts.h"++#include "tensorflow/core/framework/tensor_reference.h"+#include "tensorflow/core/platform/logging.h"++namespace tensorflow {++using tensorflow::Tensor;+using tensorflow::TensorHandleInterface;++namespace {++struct TFDLMTensor {

Unsure what the "M" stands for. Can you please expand the name a bit?

VoVAllen

comment created time in 9 hours

Pull request review commenttensorflow/tensorflow

[WIP] DLPack functions

+#include "tensorflow/c/eager/dlpack.h"

Needs a copyright header; see the other .cc files in the directory.

VoVAllen

comment created time in 10 hours

Pull request review commenttensorflow/tensorflow

[WIP] DLPack functions

+#include "tensorflow/c/eager/dlpack.h"+#include "include/dlpack/dlpack.h"  // TF:dlpack

Where will you get this header from?

VoVAllen

comment created time in 9 hours

Pull request review commenttensorflow/tensorflow

[WIP] DLPack functions

+#include "tensorflow/c/eager/dlpack.h"+#include "include/dlpack/dlpack.h"  // TF:dlpack+#include "tensorflow/c/eager/c_api_internal.h"+#include "tensorflow/c/tf_status_helper.h"+#include "tensorflow/core/framework/tensor.h"+#include "tensorflow/core/platform/casts.h"++#include "tensorflow/core/framework/tensor_reference.h"+#include "tensorflow/core/platform/logging.h"++namespace tensorflow {++using tensorflow::Tensor;+using tensorflow::TensorHandleInterface;++namespace {++struct TFDLMTensor {+  TensorReference* handle;+  DLManagedTensor tensor;+};++TensorHandle* GetTensorHandleFromTFEHandle(TFE_TensorHandle* h,+                                           TF_Status* status) {+  if (h == nullptr || !h->handle->IsValid(&status->status)) {+    status->status = tensorflow::errors::InvalidArgument(+        "The passed in handle is a nullptr");+    return nullptr;+  }+  tensorflow::TensorHandle* handle =+      tensorflow::down_cast<tensorflow::TensorHandleInterface*>(h->handle.get())+          ->Handle();++  if (handle->IsRemote()) {+    status->status = tensorflow::errors::InvalidArgument(+        "TFE_TensorHandleDevicePointer may not be called on a remote tensor "+        "handle.");+    return nullptr;+  }+  return handle;+}++const Tensor* GetTensorFromHandle(TFE_TensorHandle* h, TF_Status* status) {+  TensorHandle* handle = GetTensorHandleFromTFEHandle(h, status);++  if (handle->IsRemote()) {+    status->status = tensorflow::errors::InvalidArgument(+        "TFE_TensorHandleDevicePointer may not be called on a remote tensor "+        "handle.");+    return nullptr;+  }+  tensorflow::Device* device(absl::get<tensorflow::Device*>(handle->device()));+  if (device != nullptr) {+    status->status = device->Sync();+    if (!status->status.ok()) {+      return nullptr;+    }+  }+  const tensorflow::Tensor* tensor;+  status->status = handle->Tensor(&tensor);+  if (!status->status.ok()) {+    return nullptr;+  }+  return tensor;+};++void deleter(DLManagedTensor* arg) {+  TFDLMTensor* owner = static_cast<TFDLMTensor*>(arg->manager_ctx);+  owner->handle->Unref();+  delete owner;+}++DLDataType getDLDataType(TF_DataType data_type, TF_Status* status) {+  DLDataType dtype;+  dtype.lanes = 1;+  dtype.bits = TF_DataTypeSize(data_type) * 8;+  switch (data_type) {+    case TF_DataType::TF_FLOAT:+      dtype.code = DLDataTypeCode::kDLFloat;+      break;+    case TF_DataType::TF_DOUBLE:+      dtype.code = DLDataTypeCode::kDLFloat;+      break;+    case TF_DataType::TF_INT32:+      dtype.code = DLDataTypeCode::kDLInt;+      break;+    case TF_DataType::TF_UINT8:+      dtype.code = DLDataTypeCode::kDLUInt;+      break;+    case TF_DataType::TF_INT16:+      dtype.code = DLDataTypeCode::kDLInt;+      break;+    case TF_DataType::TF_STRING:+      dtype.code = DLDataTypeCode::kDLFloat;+      break;+    case TF_DataType::TF_COMPLEX64:+      status->status = tensorflow::errors::InvalidArgument(+          "TF_COMPLEX64 is not supported by dlpack");+      break;+    case TF_DataType::TF_INT64:+      dtype.code = DLDataTypeCode::kDLInt;+      break;+    case TF_DataType::TF_BOOL:+      dtype.code = DLDataTypeCode::kDLUInt;+      break;+    case TF_DataType::TF_QINT8:+      status->status = tensorflow::errors::InvalidArgument(+          "TF_QINT8 is not supported by dlpack");+      break;+    case TF_DataType::TF_QUINT8:+      status->status = tensorflow::errors::InvalidArgument(+          "TF_QUINT8 is not supported by dlpack");+      break;+    case TF_DataType::TF_QINT32:+      status->status = 
tensorflow::errors::InvalidArgument(+          "TF_QINT32 is not supported by dlpack");+      break;+    case TF_DataType::TF_BFLOAT16:+      dtype.code = DLDataTypeCode::kDLBfloat;+      break;+    case TF_DataType::TF_QINT16:+      status->status = tensorflow::errors::InvalidArgument(+          "TF_QINT16 is not supported by dlpack");+      break;+    case TF_DataType::TF_QUINT16:+      status->status = tensorflow::errors::InvalidArgument(+          "TF_QUINT16 is not supported by dlpack");+      break;+    case TF_DataType::TF_COMPLEX128:+      status->status = tensorflow::errors::InvalidArgument(+          "TF_COMPLEX128 is not supported by dlpack");+      break;+    case TF_DataType::TF_HALF:+      dtype.code = DLDataTypeCode::kDLFloat;+      break;+    case TF_DataType::TF_RESOURCE:+      status->status = tensorflow::errors::InvalidArgument(+          "TF_RESOURCE is not supported by dlpack");+      break;+    case TF_DataType::TF_VARIANT:+      status->status = tensorflow::errors::InvalidArgument(+          "TF_VARIANT is not supported by dlpack");+      break;+    case TF_DataType::TF_UINT32:+      dtype.code = DLDataTypeCode::kDLUInt;+      break;+    case TF_DataType::TF_UINT64:+      dtype.code = DLDataTypeCode::kDLUInt;+      break;

This seems wrong: why are both uint32 and uint64 mapped to the same DL data type?

VoVAllen

comment created time in 9 hours

issue commenttensorflow/tensorflow

tensorflow java GPU compute capabilties 6.0 instead of 3.7

I believe the culprit is this line: https://github.com/tensorflow/tensorflow/blob/aa50d2b624c7e8d56b4b1644c4ccf489d8e8c55c/tensorflow/tools/ci_build/presubmit/ubuntu_16/gpu_py36_full/build.sh#L41-L44

The comment seems to indicate that this env variable is not used. I don't really understand the setup but it looks like the Java build is picking it up.

How did you triage the problem to that line? Is it because that's the only place that uses 6.0 as a compute capability? If so, that might be a red herring.

IIUC the TF JNI GPU packages are uploaded here https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-gpu-linux-x86_64.tar.gz.

@gunan Do you know how these JNI packages are built and uploaded?

callicles

comment created time in a day

issue commenttensorflow/tensorflow

High RAM Usage for TF Runtime?

I believe RSS includes all of the CUDA libraries TensorFlow loads which could explain the large memory footprint you're seeing. Given that, I would expect multiple TF gpu processes to share the actual physical memory for these loaded shared objects.

yetanotheryeti

comment created time in 3 days

issue commenttensorflow/tensorflow

How can I clear GPU memory in tensorflow 2?

@sanjoy I think that nvidia-smi does not list GPU processes when used within Docker (as in my case)

I see, thanks!

How do you exit the TF processes?

HristoBuyukliev

comment created time in 3 days

issue commenttensorflow/tensorflow

When i try run the `tf.keras.layers.Bidirectional` on my windows system, it turns out CancelledError!

Can you please check if this helps:

And does preventing TF from allocating all of the GPU memory at startup (instructions) help?

shazhongcheng

comment created time in 3 days

issue commenttensorflow/tensorflow

Failed to get device attribute 13 for device 0

I suspect MX230 is not CUDA capable, I don't see it listed on https://developer.nvidia.com/cuda-gpus. @nluehr Any idea?

For completeness, attribute 13 is CU_DEVICE_ATTRIBUTE_CLOCK_RATE.

Dominux

comment created time in 3 days

issue commenttensorflow/tensorflow

When i try run the `tf.keras.layers.Bidirectional` on my windows system, it turns out CancelledError!

Hello! I try tensorflow-gpu-2.1 and cuda10.1

But still run error!

Unfortunately the Python program works fine on my local machine (P100): log.

Are you able to run RNN cells using cuDNN on your machine normally? And does preventing TF from allocating all of the GPU memory at startup (instructions) help?
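
(For reference, the "(instructions)" link presumably points to TF's GPU guide; a minimal sketch of what it amounts to in TF 2.x, assuming the tf.config.experimental APIs, is below. It has to run before any op touches the GPU.)

import tensorflow as tf

# Allocate GPU memory on demand instead of reserving (almost) all of it at startup.
for gpu in tf.config.experimental.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)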

shazhongcheng

comment created time in 3 days

issue commenttensorflow/tensorflow

Tensorflow 1.15 doesn't do incremental memory growth

Are you implying that memory growth is not happening any more on 1.15 and RTX2070.

allow_growth is a core TF-level setting. When it is set to false (the default), TensorFlow itself allocates almost all of the GPU memory into its memory pool and uses this pool to satisfy memory allocation requests from the various GPU kernels. When it is set to true, TF starts off allocating only a small amount of memory into the pool and expands the pool as needed (e.g. if the pool has 1G of memory free and a GPU kernel wants 2G of memory, then TF will allocate 1G more from the GPU runtime).

However, allow_growth has no bearing on how much memory will actually be demanded by GPU kernels. As an extreme example, if you have a GPU kernel that by itself needs all the memory the GPU has available, then irrespective of whether allow_growth is true or false, the program will immediately consume all of the GPU's memory when this kernel runs.

I suspect that this is what is happening here. We have not stopped respecting allow_growth, but the newer cuDNN probably has some algorithms that ask for more scratch memory in order to run faster.

Specifically, if I disable autotuning by setting the environment variable TF_CUDNN_USE_AUTOTUNE to false then the memory use stabilizes to 1857MiB (on my Titan-V). If you are okay with the performance hit of disabling autotuning then this could be a good workaround for you.
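
(For concreteness, a minimal TF 1.x sketch of the two knobs discussed above; this is illustrative rather than a drop-in fix, and the exact memory numbers will differ per GPU and cuDNN version.)

import os

# Disable cuDNN autotuning before TensorFlow initializes; autotuning can request
# large scratch buffers while it benchmarks convolution algorithms.
os.environ["TF_CUDNN_USE_AUTOTUNE"] = "false"

import tensorflow as tf

config = tf.compat.v1.ConfigProto()
# Start with a small memory pool and grow it on demand instead of reserving
# (almost) all GPU memory up front.
config.gpu_options.allow_growth = True

with tf.compat.v1.Session(config=config) as sess:
    pass  # build and run the model as usual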

CMCDragonkai

comment created time in 3 days

issue commenttensorflow/tensorflow

Tensorflow 1.15 doesn't do incremental memory growth

This behaviour is different from before. When allow memory growth was enabled I saw a progressive incremental gpu memory growth.

It would be nice to have some more information here (what TF versions are you comparing?) but ultimately I'm not sure if there is much for us to do here. This could just be a matter of the executor executing the graph in a different order.

CMCDragonkai

comment created time in 4 days

issue commenttensorflow/tensorflow

Could not create cudnn handle: CUDNN_STATUS_INTERNAL_ERROR

@chsigg Any suggestions? Maybe we can try to initialize cuDNN, cuBLAS and other NVIDIA libraries before we reserve all of the GPU memory?

We can also try to enable allow_growth by default, but that's going to take time.

6etacat

comment created time in 4 days

issue commenttensorflow/tensorflow

Image Classification pretrained model breaks for batch size 1 with BaseCollectiveExecutor::StartAbort error

Unfortunately the original author of image_classification.py has moved on to other things. @DEKHTIARJonathan do you want to take a look?

mankeyboy

comment created time in 4 days

issue commenttensorflow/tensorflow

Tensorflow 1.15 doesn't do incremental memory growth

As I watch nvidia-smi, I always see almost the entire GPUs allocated.

allow_growth only means that TF will start off with allocating only part of the GPU memory, but there is no limit to how much of the GPU memory it can use over the execution of the program (i.e. over time, the GPU memory usage can grow). Is this what you are observing? If so, I think this is WAI.

To keep TF from allocating more than half the GPU for the lifetime of the process, you need to use per_process_gpu_memory_fraction, as you said you are doing.
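
(A minimal TF 1.x sketch of that setting, assuming half the device is the intended cap:)

import tensorflow as tf

config = tf.compat.v1.ConfigProto()
# Cap TF's memory pool at ~50% of the GPU for the lifetime of the process.
config.gpu_options.per_process_gpu_memory_fraction = 0.5

with tf.compat.v1.Session(config=config) as sess:
    pass  # TF will not grow its pool past the cap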

In all other cases I see the the GPU memories get used up entirely and I get these messages:

Are these fatal errors or does the program continue to work? TF often tries to allocate memory speculatively to use as workspace for cuDNN convolutions, but if the allocation fails it just falls back to a slower algorithm that uses less memory. So if the OOM warnings are non-fatal then this is WAI too, although it does indicate the logging output could be made more helpful.

CMCDragonkai

comment created time in 6 days

issue commenttensorflow/tensorflow

GPU memory not released until Java process terminates

Is there a way to turn this into a feature request, if it doesn't already exist?

This GH issue can serve as a feature request. We don't have anyone working on this in Q1 though.

TheSentry

comment created time in 6 days

issue commenttensorflow/tensorflow

[TF2] TRT Engine Ops are not garbage collected, resulting in incorrect reuse

@bixia1 @DEKHTIARJonathan Do either of you have cycles to pick this up?

MattConley

comment created time in 6 days

issue commenttensorflow/tensorflow

[TF2] TRT Engine Ops are not garbage collected, resulting in incorrect reuse

This forced garbage collection is likely an attempted workaround for the issue: https://github.com/tensorflow/tensorflow/blob/v2.1.0/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py#L372

That looks super fishy to me as well -- we should never need to rely on GC for correctness.

Is this happening because we are keying TRT engines on the tensor shapes of the inputs and ignoring the dtypes? I.e., the key does not capture enough information?

If so I'd suggest the following fix:

  • Make sure each TRTEngineOp has a segment_func with a (process wide) different name.
  • Key the EngineContext on the tensor shapes, dtypes, ID and segment_func name.
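
(A hypothetical Python sketch of such a key, purely for illustration; the real cache lives in the TRTEngineOp C++ code and none of these names are actual TF-TRT identifiers.)

def make_engine_cache_key(input_shapes, input_dtypes, op_id, segment_func_name):
    # Including dtypes and the process-wide unique segment_func name keeps two
    # different engines from colliding when they happen to share input shapes.
    shapes = tuple(tuple(shape) for shape in input_shapes)
    dtypes = tuple(str(dtype) for dtype in input_dtypes)
    return (shapes, dtypes, op_id, segment_func_name)

engine_cache = {}
key = make_engine_cache_key([(1, 224, 224, 3)], ["float32"], "trt_op_0", "segment_func_0")
engine_cache[key] = "cached EngineContext goes here"
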
MattConley

comment created time in 6 days

issue commenttensorflow/tensorflow

How can I clear GPU memory in tensorflow 2?

However, the only way I can then release the GPU memory is to restart my computer.

How do you exit the TF processes?

This looks like an issue with nvidia-smi based on your last comment. If lsof /dev/nvidia2 can find the processes using the GPU then nvidia-smi should find them as well.

HristoBuyukliev

comment created time in 11 days

pull request commenttensorflow/tensorflow

Enable preventing engine build at runtime

My machine does have two GPUs so that's probably it, but I haven't checked.

Just to clarify, the next step here is: someone ( @DEKHTIARJonathan ?) needs to commandeer this PR and apply the patch from the previous comment. Then the tests should start passing and we can continue reviewing the PR.

pooyadavoodi

comment created time in 12 days

pull request commenttensorflow/tensorflow

Add NVTX Ranges

Hi I have benchmarked Tensorflow with/without this PR, here are the results:

There is a very small difference (0.3%). Do we know why that happens? Or is that just noise? (I don't think we need to sink time into addressing the delta; I'm mainly wondering if this is expected.)

nluehr

comment created time in 12 days

pull request commentnumba/numba

[DISCUSSION] Specify synchronization and lifetime semantics of CUDA Array Interface

cc @sanjoy - Would these modifications to the specification provide a sufficient basis for implementing support for __cuda_array_interface__ in TensorFlow?

Sorry for the delayed response. This went into my personal email account and fell through the cracks.

As written I'm not sure if the spec is complete: I believe there is one (non-legacy) "default" stream per thread, so "the consumer is expected to operate on the data on the default stream" does not unambiguously refer to one specific stream.

gmarkall

comment created time in 13 days

pull request commenttensorflow/tensorflow

Enable preventing engine build at runtime

This usually happens when TRT can't run at all. The reason could vary from bad GPU or OOM. It could be also because of having multiple GPUs (due to a known issue in TF-TRT). Could you rerun the test with 1 gpu?

My machine does have two GPUs so that's probably it, but I haven't checked.

pooyadavoodi

comment created time in 13 days

issue commenttensorflow/tensorflow

tf.function using higher GPU memory than normal python function

Interesting find!

Can you check whether forcing TF to use a smaller amount of memory (using set_local_device_configuration) runs fine (i.e. without OOM) with tf.function?

tf.function uses the graph executor which (grossly simplifying) picks some execution order consistent with the data flow in the graph and this total order could be different enough from the eager execution order to explain the difference in memory usage. However, the TF allocator has a mechanism to (again, oversimplifying) "pick" a different total order to avoid an OOM.
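
(A minimal sketch of capping TF's GPU memory in 2.x; I'm assuming the set_local_device_configuration mentioned above refers to the virtual-device configuration API, and 2048 MB is just an example limit.)

import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    # Expose one logical GPU capped at 2 GB so that a too-greedy execution
    # order under tf.function surfaces as an OOM quickly.
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])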

abhigoyal2210

comment created time in 14 days

Pull request review commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple and export it

 def supported_precision_modes():
 # so it can produce reasonable performance results with the default.
 DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES = 1 << 30
-# TrtConversionParams encapsulates the parameters that are used for TF-TRT
-# conversion.
-TrtConversionParams = collections.namedtuple(
-    "TrtConversionParams",
-    [
-
-        # A template RewriterConfig proto used to create a TRT-enabled
-        # RewriterConfig. If None, it will use a default one.
-        "rewriter_config_template",
-
-        # The maximum GPU temporary memory which the TRT engine can use at
-        # execution time. This corresponds to the 'workspaceSize' parameter of
-        # nvinfer1::IBuilder::setMaxWorkspaceSize().
-        "max_workspace_size_bytes",
-
-        # One of TrtPrecisionMode.supported_precision_modes().
-        "precision_mode",
-
-        # The minimum number of nodes required for a subgraph to be replaced by
-        # TRTEngineOp.
-        "minimum_segment_size",
-
-        # Whether to generate dynamic TRT ops which will build the TRT network
-        # and engine at run time.
-        # i.e. Since TensorRT version < 6.0 does not support dynamic dimensions
-        # other than the batch dimension, when the TensorFlow graph has a
-        # non-batch dimension of dynamic size, we would need to enable this
-        # option. This option should be set to True in TF 2.0.
-        "is_dynamic_op",
-
-        # Max number of cached TRT engines for dynamic TRT ops.
-        # Created TRT engines for a dynamic dimension are cached.
-        # This is the maximum number of engines that can be cached.
-        # If the number of cached engines is already at max but none of them
-        # supports the input shapes, the TRTEngineOp will fall back to run the
-        # original TF subgraph that corresponds to the TRTEngineOp.
-        "maximum_cached_engines",
-
-        # This argument is ignored if precision_mode is not INT8. If set to
-        # True, a calibration graph will be created to calibrate the missing
-        # ranges. The calibration graph must be converted to an inference graph
-        # by running calibration with calibrate(). If set to False, quantization
-        # nodes will be expected for every tensor in the graph (exlcuding those
-        # which will be fused). If a range is missing, an error will occur.
-        # Please note that accuracy may be negatively affected if there is a
-        # mismatch between which tensors TRT quantizes and which tensors were
-        # trained with fake quantization.
-        "use_calibration",
-
-        # Max size for the input batch.
-        # This parameter is only effective when is_dynamic_op=False which
-        # is not supported in TF 2.0.
-        "max_batch_size",
-    ])
-
-DEFAULT_TRT_CONVERSION_PARAMS = TrtConversionParams(
-    rewriter_config_template=None,
-    max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
-    precision_mode=TrtPrecisionMode.FP32,
-    minimum_segment_size=3,
-    is_dynamic_op=True,
-    maximum_cached_engines=1,
-    use_calibration=True,
-    max_batch_size=1)
+
+@tf_export("experimental.tensorrt.ConversionParams", v1=[])
+class TrtConversionParams(object):

Let's fix this in a follow-up commit. CC @DEKHTIARJonathan

pooyadavoodi

comment created time in 15 days

pull request commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple and export it

@rthadur I think it will be easier if I fixed these internally. Can you please import the CL and CC it to me?

Sorry, looks like you already did that. :) Let me see if I can quickly fix these locally.

pooyadavoodi

comment created time in 15 days

pull request commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple and export it

@rthadur I think it will be easier if I fixed these internally. Can you please import the CL and CC it to me?

pooyadavoodi

comment created time in 15 days

issue commenttensorflow/tensorflow

TF.distribute.MirroredStrategy() crashes

maybe there would be a way tensorflow could test this or give a hint in that direction?

I think it is difficult to go from CUDNN_STATUS_INTERNAL_ERROR to "GPU may be faulty" since there are many other reasons why cuDNN will return this error.

However, maybe we should link to the gpu-burn test from the TF docs as something users should consider? CC @dynamicwebpaige

m4tts

comment created time in 15 days

pull request commenttensorflow/tensorflow

Enable preventing engine build at runtime

I triaged the problem down to OpsTestBase. With this change patched in I now get a different failure from the test.

pooyadavoodi

comment created time in 18 days

pull request commenttensorflow/tensorflow

Horizontal fusion

@thomasjoerg, I will need to bother you to kick off the CI again.

Done.

trentlo

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

[XLA] Change the default SM in XLA.

 const int kAMDGPUInlineThreshold = 0x100000;
 // Default inline threshold value to use in llvm.
 const int kDefaultInlineThreshold = 1100;
-// Gets the GPU name as it's known to LLVM for a given compute capability.  If
-// we see an unrecognized compute capability, we return "sm_35".
+// Gets the GPU name as it's known to LLVM for a given compute
+// capability.  If we see an unrecognized compute capability, we
+// return the highest one that is known and below the selected device.
 static string GetSmName(std::pair<int, int> compute_capability) {
-  static auto* m = new std::map<std::pair<int, int>, int>({
-      {{3, 5}, 35},
-      {{3, 7}, 37},
-      {{5, 0}, 50},
-      {{5, 2}, 52},
-      {{5, 3}, 53},
-      {{6, 0}, 60},
-      {{6, 1}, 61},
-      {{6, 2}, 62},
-      {{7, 0}, 70},
-      {{7, 2}, 72},
-      {{7, 5}, 75},
-  });
+  int ccv = compute_capability.first * 10 + compute_capability.second;

Can we use a more descriptive name than ccv?

nouiz

comment created time in 19 days

pull request commenttensorflow/tensorflow

Add multi-algorithm deterministic cuDNN convolutions

@akuegel, oh no! Could you tell me which test target became flaky?

It is an internal target, but AFAICT the target was always flaky. I'm now following up with the team internally.

duncanriach

comment created time in 20 days

Pull request review commenttensorflow/tensorflow

[XLA] Change the default SM in XLA.

 static string GetSmName(std::pair<int, int> compute_capability) {
       {{7, 5}, 75},
   });
   int sm_version = 35;
-  auto it = m->find(compute_capability);
-  if (it != m->end()) {
-    sm_version = it->second;
-  } else {
+  // If the current compute capability isn't known, fallback to the
+  // most recent version before the unknown version.
+  for (auto iter = m->begin(); iter != m->end(); ++iter) {

How about std::prev(m->upper_bound(...))? Will that be correct? If upper_bound returns begin then we'll LOG the unknown compute capability error message.

nouiz

comment created time in 20 days

Pull request review commenttensorflow/tensorflow

[XLA] Change the default SM in XLA.

 static string GetSmName(std::pair<int, int> compute_capability) {
       {{7, 5}, 75},
   });
   int sm_version = 35;
-  auto it = m->find(compute_capability);
-  if (it != m->end()) {
-    sm_version = it->second;
-  } else {
+  // If the current compute capability isn't known, fallback to the
+  // most recent version before the unknown version.
+  for (auto iter = m->begin(); iter != m->end(); ++iter) {
+    auto k = iter->first;
+    if ((k.first < compute_capability.first) ||
+        (k.first == compute_capability.first &&
+         k.second <= compute_capability.second)) {
+      sm_version = iter->second;
+    }
+  }
+  std::cout << "SM " << sm_version;

Debug print? (Feel free to use a VLOG(1) for this.)

nouiz

comment created time in 21 days

Pull request review commenttensorflow/tensorflow

[XLA] Change the default SM in XLA.

 static string GetSmName(std::pair<int, int> compute_capability) {
       {{7, 5}, 75},
   });
   int sm_version = 35;
-  auto it = m->find(compute_capability);
-  if (it != m->end()) {
-    sm_version = it->second;
-  } else {
+  // If the current compute capability isn't known, fallback to the
+  // most recent version before the unknown version.
+  for (auto iter = m->begin(); iter != m->end(); ++iter) {

Is this just m->lower_bound(compute_capability)?

nouiz

comment created time in 21 days

Pull request review commenttensorflow/tensorflow

[XLA] Change the default SM in XLA.

 static string GetSmName(std::pair<int, int> compute_capability) {
       {{7, 5}, 75},
   });
   int sm_version = 35;
-  auto it = m->find(compute_capability);
-  if (it != m->end()) {
-    sm_version = it->second;
-  } else {
+  // If the current compute capability isn't known, fallback to the
+  // most recent version before the unknown version.
+  for (auto iter = m->begin(); iter != m->end(); ++iter) {
+    auto k = iter->first;
+    if ((k.first < compute_capability.first) ||

Unnecessary parens.

nouiz

comment created time in 21 days

issue commenttensorflow/tensorflow

device_lib.list_local_devices() InvalidArgumentError: Invalid device ordinal value (1). Valid range is [0, 0].

Can you please share the log file? The original error has now been replaced by a LOG(INFO) line so you're probably seeing a different error.

shun-lin

comment created time in 21 days

issue commenttensorflow/tensorflow

Expand registered kernels for variable ops on GPU

I can. This is my first PR. Will tests be run on servers or do I have to run tests locally?

The PR won't be merged unless all tests pass (this is automatically enforced). However it is usually better to run tests locally so that your PR does not get blocked.

dirktheeng

comment created time in 21 days

issue commenttensorflow/tensorflow

Expand registered kernels for variable ops on GPU

@sanjoy I would prefer to see TF_CALL_GPU_ALL_TYPE on the variableop

Just to be clear, you're talking about the Variable and VariableV2 op, not VarHandleOp?

dirktheeng

comment created time in 22 days

issue commenttensorflow/tensorflow

device_lib.list_local_devices() InvalidArgumentError: Invalid device ordinal value (1). Valid range is [0, 0].

Hi Sanjoy, Thanks for your response. I am running tensorflow 1.14.0

Thank you. I believe this issue has been fixed; can you check if you can reproduce the problem with tf-nightly?

shun-lin

comment created time in 22 days

issue commenttensorflow/tensorflow

Expand registered kernels for variable ops on GPU

Can you be more specific on which ops you want to modify? For instance, I'm not sure if we should be calling TF_CALL_GPU_ALL_TYPES on AssignAddVariableOp since we can't add bools.

dirktheeng

comment created time in 22 days

pull request commenttensorflow/tensorflow

Horizontal fusion

No problem to provide a fix to it. However, @thomasjoerg did you mean someone already fix it and a commit is on the way?

I suspect in " This is a known issue and a fix is underway." @thomasjoerg was referring to "Unfortunately the open-source CI did not catch these test breakages.". I'll ping him internally to reply here, but we're outside his working hours.

trentlo

comment created time in 22 days

pull request commenttensorflow/tensorflow

Horizontal fusion

Got it. Let me know if there is anything I can help with.

Can you fix the CHECK-failures Thomas mentioned above? Or are they already fixed?

trentlo

comment created time in 23 days

issue closedtensorflow/tensorflow

Instantiating Separate Session for Each GPU

I'm trying to speed up Tensorflow inference by using multiple GPUs and instantiating a separate Session for each GPU.

I am using Tensorflow 1.7 and 2 Quadro GV100's for this. The GV100's are device 0 and device 1, respectively.

My C++ code looks something like the following:

auto options0 = SessionOptions();
options0.config.mutable_gpu_options()->set_visible_device_list("0");
NewSession(options0, &m_session0);

auto options1 = SessionOptions();
options1.config.mutable_gpu_options()->set_visible_device_list("1");
NewSession(options1, &m_session1);

However, when I execute this code, I get the following error message:

name: Quadro GV100 major: 7 minor: 0 memoryClockRate(GHz): 1.627
pciBusID: 0000:18:00.0
totalMemory: 31.87GiB freeMemory: 31.33GiB
2019-01-07 17:56:09.453901: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:1423] Adding visible gpu devices: 0
2019-01-07 17:56:10.196800: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:911] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-01-07 17:56:10.202994: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:917] 0
2019-01-07 17:56:10.206965: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:930] 0: N
2019-01-07 17:56:10.211407: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:1041] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 30421 MB memory) -> physical GPU (device: 0, name: Quadro GV100, pci bus id: 0000:18:00.0, compute capability: 7.0)
17:56:11.099: Choosing GPU: 1
2019-01-07 17:56:11.280313: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:1344] Found device 0 with properties:
name: Quadro GV100 major: 7 minor: 0 memoryClockRate(GHz): 1.627
pciBusID: 0000:3b:00.0
totalMemory: 31.87GiB freeMemory: 31.33GiB
2019-01-07 17:56:11.291274: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:1423] Adding visible gpu devices: 1
2019-01-07 17:56:12.076807: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:911] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-01-07 17:56:12.082561: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:917] 1
2019-01-07 17:56:12.088587: I C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_device.cc:930] 1: N
2019-01-07 17:56:12.095142: F C:\tensorflow_1_7\tensorflow\tensorflow\core\common_runtime\gpu\gpu_id_manager.cc:45] Check failed: cuda_gpu_id.value() == result.first->second (1 vs. 0)Mapping the same TfGpuId to a different CUDA GPU id. TfGpuId: 0 Existing mapped CUDA GPU id: 0 CUDA GPU id being tried to map to: 1

My question is, what is the proper way to assign/dedicate a GPU to each Tensorflow session?

Thank you very much for your help in advance!

closed time in 23 days

GothamCityPro

issue commenttensorflow/tensorflow

Instantiating Separate Session for Each GPU

Closing for lack of activity. If this is still blocking you please reopen with a reproducer.

GothamCityPro

comment created time in 23 days

issue commenttensorflow/tensorflow

device_lib.list_local_devices() InvalidArgumentError: Invalid device ordinal value (1). Valid range is [0, 0].

@shun-lin Can you try running with tf-nightly? The codepath that was causing the original error has been fixed.

@maximuslee1226 What version of TF are you using?

shun-lin

comment created time in 23 days

pull request commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple and export it

The windows build is failing: https://source.cloud.google.com/results/invocations/1eb40532-983f-47e5-b172-f91a7c7f842c/targets/%2F%2Ftensorflow%2Ftools%2Fci_build%2Fbuilds:gen_win_out/log

ERROR: T:/src/github/tensorflow/tensorflow/python/keras/api/BUILD:116:1: Executing genrule //tensorflow/python/keras/api:keras_python_api_gen_compat_v1 failed (Exit 1)
Traceback (most recent call last):
  File "\\?\T:\tmp\Bazel.runfiles_aoh3x156\runfiles\org_tensorflow\tensorflow\python\tools\api\generator\create_python_api.py", line 27, in <module>
    from tensorflow.python.tools.api.generator import doc_srcs
  File "\\?\T:\tmp\Bazel.runfiles_aoh3x156\runfiles\org_tensorflow\tensorflow\python\__init__.py", line 85, in <module>
    from tensorflow.python.ops.standard_ops import *
  File "\\?\T:\tmp\Bazel.runfiles_aoh3x156\runfiles\org_tensorflow\tensorflow\python\ops\standard_ops.py", line 115, in <module>
    from tensorflow.python.compiler.tensorrt import trt_convert_windows as trt
  File "\\?\T:\tmp\Bazel.runfiles_aoh3x156\runfiles\org_tensorflow\tensorflow\python\compiler\tensorrt\trt_convert_windows.py", line 30, in <module>
    class TrtConversionParams(object):
  File "\\?\T:\tmp\Bazel.runfiles_aoh3x156\runfiles\org_tensorflow\tensorflow\python\compiler\tensorrt\trt_convert_windows.py", line 35, in TrtConversionParams
    max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
NameError: name 'DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES' is not defined
Target //tensorflow/tools/pip_package:build_pip_package failed to build
ERROR: T:/src/github/tensorflow/tensorflow/python/tools/BUILD:98:1 Executing genrule //tensorflow/python/keras/api:keras_python_api_gen_compat_v1 failed (Exit 1)
INFO: Elapsed time: 1336.958s, Critical Path: 546.13s
INFO: 5660 processes: 3285 remote cache hit, 2375 local.
FAILED: Build did NOT complete successfully
FAILED: Build did NOT complete successfully
pooyadavoodi

comment created time in 25 days

Pull request review commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple and export it

 def __init__(self,
               input_saved_model_dir=None,
               input_saved_model_tags=None,
               input_saved_model_signature_key=None,
-               conversion_params=DEFAULT_TRT_CONVERSION_PARAMS):
+               conversion_params=TrtConversionParams()):

We can't change the default value to None due to backward compatibility.

I may be missing some Python subtleties, but can you default it to None and then within the function do

if conversion_params is None:
  conversion_params = TrtConversionParams()

?

pooyadavoodi

comment created time in a month

issue commenttensorflow/tensorflow

Error polling for event status: failed to query event: CUDA_ERROR_UNKNOWN during training process

unfortunately it freezes my pc then the system reboots when it does occur. I guess the GPU crashes?

That sounds like a kernel / driver issue to me, not something specific to TF (IIUC TF should not be able to crash the machine). Can you bring this up on NVIDIA's support forums?

cristiangogosila

comment created time in a month

pull request commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple and export it

Looks like when I run bazel test tensorflow/tools/api/tests:api_compatibility_test, it fails similar to github:

Did you figure out why it wasn't failing in https://github.com/tensorflow/tensorflow/pull/35198#issuecomment-575753807?

Other than that, I'm waiting for @alextp here.

pooyadavoodi

comment created time in a month

issue commenttensorflow/tensorflow

Error polling for event status: failed to query event: CUDA_ERROR_UNKNOWN during training process

Thank you for the offer sanjoy. Do you know if tensorflow save error logs like this somewhere?

Just what it prints out to stderr.

For extra points you could also set the environment variable TF_CPP_MIN_VLOG_LEVEL to 1 (used here) which will turn on verbose logging. However, I'm not sure if the logging output will remain small enough to be uploaded with this set.
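
(A minimal sketch; the variable has to be set before TensorFlow's C++ runtime loads, i.e. before the import.)

import os

os.environ["TF_CPP_MIN_VLOG_LEVEL"] = "1"  # must be set before importing TF

import tensorflow as tf  # verbose C++ logs now go to stderr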

cristiangogosila

comment created time in a month

issue commenttensorflow/tensorflow

Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.

Unfortunately I don't have easy access to an RTX 2070 so I cannot reproduce this error.

You said

And the same script runs on the same machine on other TensorFlow builds/versions.

What other builds did you try? And does this work on tf-nightly?

abcdabcd987

comment created time in a month

Pull request review commenttensorflow/tensorflow

[SEGFAULT]: root_node nullptr check

 bool HloParserImpl::ParseInstructionList(HloComputation** computation,
     // the pool, which should not happen.
     if (root_node == nullptr) {
       LOG(FATAL) << "instruction " << root_name
-                 << " was marked as ROOT but the parser has not seen it before";
+                 << " was marked as ROOT but the parser has not seen it before"; // LOG(FATAL) crashes the program by calling abort()

Final comment: please end the sentence with a period.

gaurav1086

comment created time in a month

Pull request review commenttensorflow/tensorflow

[SEGFAULT]: root_node nullptr check

 bool HloParserImpl::ParseInstructionList(HloComputation** computation,
     // the pool, which should not happen.
     if (root_node == nullptr) {
       LOG(FATAL) << "instruction " << root_name
-                 << " was marked as ROOT but the parser has not seen it before";
+                 << " was marked as ROOT but the parser has not seen it before"; // abort()

Sorry for being so pedantic, but can you please use a full english sentence here? // abort() reads like code that was accidentally commented out.

gaurav1086

comment created time in a month

pull request commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple and export it

@annarev we have a case where ApiCompatibilityTest.testAPIBackwardsCompatibilityV2 is failing in google3 after the import but passing in open source. Any idea what's going on?

Sorry just saw the comment. Is this still an issue? One problem could be if the goldens need to be updated both for TF 1.x and TF 2.x versions.

Hi @annarev, I think this is still an issue, see https://github.com/tensorflow/tensorflow/pull/35198#issuecomment-575753807 ?

pooyadavoodi

comment created time in a month

issue commenttensorflow/tensorflow

Error polling for event status: failed to query event: CUDA_ERROR_UNKNOWN during training process

I have just encountered the same bug whilst training a custom GAN built with the tensorflow framework. It also occured after a few hundred epochs and the loss function+optimiser I used are the same as christiangogosilas. Rtx 2070super, python 3.7.4, tensorflow installed using anaconda.

Did you notice anything suspicious in the logs? If you link to them from here I'm happy to take a look as well.

cristiangogosila

comment created time in a month

pull request commenttensorflow/tensorflow

[SEGFAULT]: root_node nullptr check

@sanjoy , that makes sense. Imho we should both write the comment as well as add 'return false' statement.

I don't think we should also add the return false; (which will be dead code).

I'm not convinced by your argument that if someone hypothetically removes or changes the LOG(FATAL) in the future then the return false; will help us avoid a crash. I don't think code needs to be hardened against arbitrary mutations around it :); it will be the responsibility of whoever changes the LOG(FATAL) to also make sure we do the right thing for a null root_node. For instance, in the future someone could remove the entire if (root_node == nullptr) { clause, but I don't think it is productive to modify the rest of the code so that it keeps working with that mutation.

gaurav1086

comment created time in a month

pull request commenttensorflow/tensorflow

[SEGFAULT]: root_node nullptr check

@sanjoy , If someone in the future by mistake removes the "LOG(FATAL) << msg" statement or changes the log level from FATAL to something else, it might crash the code by accessing the root_node(nullptr). So good to have these additional checks (without undermining performance). However it's obviously a choice.

But there are cons to adding the return false; as well. For instance, if someone does not know that LOG(FATAL) crashes, they might assume that ParseInstructionList legitimately returns false if root_node is nullptr.

IMO the right fix here is to add a clear comment that LOG(FATAL) crashes so the immediately following deref of root_node is OK. What do you think?

gaurav1086

comment created time in a month

pull request commenttensorflow/tensorflow

[SEGFAULT]: root_node nullptr check

However, I think having this change wouldn't hurt as being good coding practice.

Can you share why you think this is good coding practice?

gaurav1086

comment created time in a month

pull request commenttensorflow/tensorflow

Add support to cuDNN CTC loss

Thanks for checking. We should be good. I'm looking at why it hasn't merged.

It was waiting for an approval from me for some reason. Should be good to go now.

houtoms

comment created time in a month

pull request commenttensorflow/tensorflow

Enable preventing engine build at runtime

bazel test is not showing the stack trace. Do you know how I can force it to show the stack trace?

You could try running the program under GDB and put a breakpoint on the constructor of std::bad_function_call to see where it is constructed.

pooyadavoodi

comment created time in a month

pull request commenttensorflow/tensorflow

Enable preventing engine build at runtime

https://en.cppreference.com/w/cpp/utility/functional/bad_function_call says "std::bad_function_call is the type of the exception thrown by std::function::operator() if the function wrapper has no target.". So probably we have an uninitialized std::function somewhere. Can you get the stack trace from where the exception is thrown?

pooyadavoodi

comment created time in a month

pull request commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple and export it

Passing the buck to @alextp -- Alex, can you PTAL if your comments in https://github.com/tensorflow/tensorflow/pull/35198#issuecomment-573832904 have been addressed?

pooyadavoodi

comment created time in a month

issue commentNVIDIA/nccl

Potential memory leak in graph/paths.cc

CC @chsigg @dubey

sanjoy

comment created time in a month

issue openedNVIDIA/nccl

Potential memory leak in graph/paths.cc

We are running into a memory leak with NCCL v2.5.6-2 and our current theory is that this is a bug in graph/paths.cc.

The leaked memory is allocated here. IIUC the memory is supposed to be freed by ncclTopoRemovePathType, but that is not happening: it only frees the paths from all nodes into nodes of type nodeType, whereas it should also be freeing the paths from nodes of type nodeType.

Changing ncclTopoRemovePathType to

static void ncclTopoRemovePathType(struct ncclTopoSystem* system,
                                   int nodeType) {
  for (int t = 0; t < NCCL_TOPO_NODE_TYPES; t++) {
    for (int n = 0; n < NCCL_TOPO_MAX_NODES; n++) {
      struct ncclTopoNode* node = system->nodes[t].nodes + n;
      free(node->paths[nodeType]);
      node->paths[nodeType] = NULL;
    }
  }

  for (int n = 0; n < NCCL_TOPO_MAX_NODES; n++) {
    struct ncclTopoNode* node = system->nodes[nodeType].nodes + n;
    free(node->paths[nodeType]);
    node->paths[nodeType] = NULL;
  }
}

fixes the leak.

I originally had the second loop run for system->nodes[nodeType].count iterations, but that doesn't work because sometimes we set the count for NET nodes to 0 here while we still have link->remNode pointing to NET nodes, which causes us to allocate into paths for NET nodes.

To reproduce, upgrade TF to use NCCL 2.6.5-2 and then run

$ bazel test --config=cuda tensorflow/core/nccl:nccl_manager_test_gpu --test_filter='NcclManagerTest/0.MultiNodeSingle'

although this test might not fail as-is since the leak checker we use internally might not be available in open source.

created time in a month

Pull request review commenttensorflow/tensorflow

Some C++ fixes

 void TF_OperationGetAttrString(TF_Operation* oper, const char* attr_name,
         InvalidArgument("Attribute '", attr_name, "' is not a string");
     return;
   }
-  if (max_length <= 0) {
+  if (max_length == 0) {
+    status->status = InvalidArgument("Attribute '", max_length, "' is zero");

I don't think this is correct. max_length is the length value and so if it is 0 then there is nothing to do here.

gaurav1086

comment created time in a month

Pull request review commenttensorflow/tensorflow

Some C++ fixes

 void TF_OperationGetAttrString(TF_Operation* oper, const char* attr_name,
         InvalidArgument("Attribute '", attr_name, "' is not a string");
     return;
   }
-  if (max_length <= 0) {
+  if (max_length == 0) {
+    InvalidArgument("Attribute '", max_length, "' is zero");

This is not the correct way to use InvalidArgument -- it returns a value. And what if max_length is less than 0?

gaurav1086

comment created time in a month

Pull request review commenttensorflow/tensorflow

Some C++ fixes

 class NotPredicate : public Predicate {
 class AndRecurrencePredicate : public Predicate {
  public:
   explicit AndRecurrencePredicate(int64 id, Predicate* start, Predicate* step,
-                                  std::vector<string> frame)
+                                  std::vector<string> &frame)
       : Predicate(id), operands_({start, step}), frame_(std::move(frame)) {}

It was intentional to take frame by value. That way if the caller passed a temporary then we don't copy anything. I suspect the form you're suggesting will incur a copy.

gaurav1086

comment created time in a month

pull request commenttensorflow/tensorflow

Fix build breakage due to missing static member definitions

Thanks. This isn't needed in C++17, but it is needed in C++14, so your fix is correct.

bas-aarts

comment created time in a month

pull request commenttensorflow/tensorflow

Fix build breakage due to missing static member definitions

I'm not sure if this should be necessary or whether it points to a problem somewhere else.

@timshen91 do you know what to expect here?

bas-aarts

comment created time in a month

pull request commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple

@alextp had this comment in the internal version of this CL:

(for tf-api-owners)

  1. It looks like the default argument is now mutable. This is dangerous as assigning to it can have unexpected consequences to unrelated code. Preserving immutability is a good idea (you can use patterns like .withchange(change) methods which return copies of the tuple).

  2. It doesn't look like the changed type was exposed in the public API. If so, how are users supposed to construct it to pass as arguments to that function?
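
(To illustrate point 1, a small self-contained sketch of the pitfall and of the copy-returning pattern; Params is a stand-in class, not the actual TrtConversionParams.)

class Params(object):
    """Stand-in for a conversion-params object, not the real TF-TRT class."""

    def __init__(self, minimum_segment_size=3):
        self.minimum_segment_size = minimum_segment_size

    def with_minimum_segment_size(self, value):
        # Return a modified copy instead of mutating this instance in place.
        return Params(minimum_segment_size=value)


DEFAULT_PARAMS = Params()


def convert(params=DEFAULT_PARAMS):
    # Mutating `params` here would silently change DEFAULT_PARAMS for every
    # later caller; that is why immutable, copy-on-change defaults are safer.
    return params.minimum_segment_size


convert(DEFAULT_PARAMS.with_minimum_segment_size(5))  # DEFAULT_PARAMS is untouched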

pooyadavoodi

comment created time in a month

issue commenttensorflow/tensorflow

Why the communication of distributed training not hidden when using XLA?

@zhuhong realistically, I won't have time for this anytime soon. Can you try using the TensorBoard profiler on your model to see if something obvious shows up?

zhuhong

comment created time in a month

pull request commenttensorflow/community

RFC: DLpack support for interoperability with other GPU frameworks

The next action item is to evaluate whether the DLPack interface can and should be implemented using the TFE_NewTensorHandleFromDeviceMemory, TFE_TensorHandleDeviceMemorySize, TFE_TensorHandleDevicePointer and a hypothetical TBD TFE_SynchronizeDevice APIs. Once we have made a decision on that, we can merge this RFC and start reviewing PRs.

Design review meeting notes

Sanjoy: 3 questions, (1) what about streams? Resolution is that frameworks need to sync their streams at the boundaries

Minjie: we can make an official document from DLPack about what should happen around streams when handing tensors around

(2) subclassing TensorBuffer vs using new allocator. Minjie: yes, we plan on doing this.

(3) around using DT_VARIANT to represent DLManagedTensor. Apparently there's no easy way.

Minjie: no easy way to pass a pointer from Python to TensorFlow. Alex: we can use TFE_NewTensorHandleFromDeviceMemory. Minjie: when will this make it into a final release? Alex: it's not in 2.1, will be in 2.2. Minjie: then we might need an interim solution until then.

Sanjoy: but either way this feature won't be available in core TF till the 2.2 release. And in the interim we have the tf-dlpack out of tree implementation.

Agarwal: is alignment a concern? Minjie: the tf to dlpack layer needs to copy the memory to ensure alignment.

Even Oldridge: can we throw an error on unaligned things? Alex: yes, we already get errors.

Agarwal: beware of int32 tensors and the host memory problems. Followed by a short discussion of the problems.

Sanjoy: should we mark any new ops as experimental? Alex: yes, any new python APIs should probably be experimental for at least one release.

Sanjoy: what about cuda_array_interface? It looks like the device memory tensor handle API would be sufficient to implement that. We also need a C API to synchronize GPU streams.

Minjie: if TF exposed its streams in the API then integration problems would be made simpler. Maybe TF should just use the default stream? Sanjoy: how would we represent this in the API? Minjie: maybe a string opaque stream ID? Sanjoy: need to make this work across cuda / rocm so hopefully some generic representation is possible.

Wolff: how can we make sure users actually know about this? Resolution: general brainstorm about using TF twitter, dgl twitter, rapids twitter, etc.

EvenOldridge

comment created time in a month

pull request commenttensorflow/tensorflow

Enable tf.nn.bias_add python op tests to work in eager mode (as well as graph mode)

I'll ping @chsigg internally to see if he can do another review or I'll find another reviewer.

duncanriach

comment created time in a month

pull request commenttensorflow/tensorflow

Change TrtConversionParams to class from NamedTuple

@annarev we have a case where ApiCompatibilityTest.testAPIBackwardsCompatibilityV2 is failing in google3 after the import but passing in open source. Any idea what's going on?

pooyadavoodi

comment created time in a month

Pull request review commenttensorflow/tensorflow

Fix saved_model_cli tensorrt conversion

 def create_parser():
       'tensorrt',
       description='Convert the SavedModel with Tensorflow-TensorRT integration',
       formatter_class=argparse.RawTextHelpFormatter)
-  parser_convert_with_tensorrt.add_argument(
-      '--max_batch_size',
-      type=int,
-      default=1,
-      help='max size for the input batch')

Are we replacing this argument with something else? Or is it not needed anymore?

wdirons

comment created time in a month

issue commenttensorflow/tensorflow

Sudden drop in conv3d_transpose GPU performance with large input sizes

@timshen91 PTAL. Maybe we can extract out a test case for cuDNN that demonstrates this issue (assuming TF is using cuDNN correctly)? /CC @nluehr

mauriceqch

comment created time in a month

issue commentnumba/numba

Discussion: should __cuda_array_interface__ include a stream pointer?

I'm fine with documenting that it is up to the user to insert the necessary synchronization.

Personally, I think this is a tricky invariant for a framework user to maintain (for reasons I mentioned above), but ultimately this is something that's up to the authors of __cuda_array_interface__ to decide.

leofang

comment created time in a month

issue closedtensorflow/tensorflow

TFLite delegates/gpu/libmetal_delegate.a is missing


System information

  • OS Platform and Distribution (e.g., Linux Ubuntu 16.04): macOS Mojave (10.14.1)
  • Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device: N/A
  • TensorFlow installed from (source or binary): Not installed.
  • TensorFlow version: master branch (latest commit 6353d940289a225cfbc104cc647b3c6970077faa)
  • Python version: 3.7.2
  • Installed using virtualenv? pip? conda?: Not installed, just calling shell scripts from repo
  • Bazel version (if compiling from source): 0.18.0
  • GCC/Compiler version (if compiling from source): N/A
  • CUDA/cuDNN version: N/A
  • GPU model and memory: N/A

Describe the problem Attempting to build iOS Metal TFLite Delegate with create_ios_frameworks.sh yields the following error:

$ tensorflow/lite/lib_package/create_ios_frameworks.sh -g
Starting
File /path/to/tensorflow/tensorflow/lite/lib_package/../delegates/gpu/libmetal_delegate.a doesn't exist.
It's requried for building TFLite Framework with GPU. Aborting.

Provide the exact sequence of commands / steps that you executed before running into the problem

This is how I am attempting to create the iOS framework:

$ tensorflow/lite/tools/make/download_dependencies.sh
$ tensorflow/lite/tools/make/build_ios_universal_lib.sh
$ tensorflow/lite/lib_package/create_ios_frameworks.sh -g

I saw that the -g flag got added to create_ios_frameworks.sh in 59d535a0df17eaf3033bbff73ef4e1e1988c454e. Without the flag, I am able to successfully build the framework but, as expected, the GPU is not utilized.

I know the GPU delegates only got open-sourced a couple of days ago, and before that I had unsurprisingly been getting the same error but with metal_delegate.h, which was added in fb772b781b011471dec443e1f3cd6b664958b767. Is libmetal_delegate.a supposed to be present or is it still pending open-sourcing?

Any other info / logs

N/A

closed time in a month

willbattel

issue commenttensorflow/tensorflow

TFLite delegates/gpu/libmetal_delegate.a is missing

Could you tell me how to fix it ? thx a lot

Looks like this is a question for @freedomtan ?

@freedomtan thank you, your solution did work. After modifying metal_delegate.mm I was able to build libmetal_delegate.a which allowed create_ios_frameworks.sh to run with the -g flag.

This issue should probably remain open until this workaround is no longer necessary.

The workaround is no longer necessary so I'm closing the issue.

willbattel

comment created time in a month

issue commenttensorflow/tensorflow

Running tensorflow on GPU is far slower than on CPU

Additionally, you can also try to use the TensorBoard profiler to see if that points to any red flags.

forReason

comment created time in a month

issue commenttensorflow/tensorflow

tf.Estimator starts with GPU and switches to CPU

Hi @arozans,

I would suggest poking at your training job with the TensorBoard profiler to see if you can spot any red flags. You can use the profiler APIs to profile the program for a few steps only after step 4000 (since that is when the problem starts).
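
(A minimal sketch, assuming a TF version that has the tf.profiler.experimental API; the step numbers are only illustrative.)

import tensorflow as tf

for step in range(10000):
    if step == 4000:
        tf.profiler.experimental.start("/tmp/tb_profile")  # start near the slowdown
    # ... run one training step here ...
    if step == 4020:
        tf.profiler.experimental.stop()  # inspect the trace in TensorBoard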

arozans

comment created time in a month

issue commenttensorflow/tensorflow

Error polling for event status: failed to query event: CUDA_ERROR_UNKNOWN during training process

This happens sometimes when i am not connected to that machine. For some reason, when i am working on that machine, i'm not getting that error.

Are you sure this is a problem with TensorFlow? This aspect of the problem makes it sound like a systemic issue (e.g. perhaps disconnecting from the machine shuts down the GPU?).

cristiangogosila

comment created time in a month

pull request commenttensorflow/tensorflow

Enable preventing engine build at runtime

Thanks @sanjoy. Linking changed the error. Now I am looking at another problem.

Looks like this isn't ready for review so removing the tag. @pooyadavoodi please LMK if this is indeed ready for review.

pooyadavoodi

comment created time in a month

issue commenttensorflow/tensorflow

DatasetVariantWrapper "No unary variant device copy function found"

Do you know how this is achieved?

Probably the placer needs to understand and respect this constraint. Maybe @iganichev knows?

mwalmsley

comment created time in 2 months

issue commenttensorflow/tensorflow

Not able to build GPU custom op example

Relevant comment from @Artem-B: https://github.com/tensorflow/tensorflow/issues/34428#issuecomment-564281533

alexminnaar

comment created time in 2 months

issue commenttensorflow/tensorflow

Failed to get device properties, error code: 30

I commented under issue #26255 but the original poster closed the issue as his problem was solved by updating to tensorflow 2. ... I am opening a new issue because updating to the pre-release is not an option

Can you check whether upgrading to TF 2 helps, now that TF 2 is no longer a pre-release?

but the current behavior is that an error has occurred and the GPU is in an unknown state; the error is logged, but the application is unaware and continues trying to use the GPU with no result. At a minimum, I think TensorFlow should throw an exception so that the application can inform the user and clean up.

If you need this behavior, perhaps you could explicitly assert tf.test.is_gpu_available()?
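
For example, a minimal guard at program start (note that tf.test.is_gpu_available() is deprecated in newer releases in favor of tf.config.list_physical_devices):

import tensorflow as tf

# Fail fast instead of silently continuing against a broken or missing GPU.
assert tf.test.is_gpu_available(), "No usable GPU found"
# Newer-API equivalent:
# assert tf.config.list_physical_devices("GPU"), "No usable GPU found"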

stevehawley

comment created time in 2 months

issue closedtensorflow/tensorflow

Estimator training hangs in multiple gpu if dataset doesn't have enough element to feed both gpus last batches

System information

  • Have I written custom code (as opposed to using a stock example script provided in TensorFlow): YES
  • OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Distributed training (one node, Multiple GPUs)
  • TensorFlow installed from (source or binary): PIP
  • TensorFlow version (use command below): TF 1.12
  • Python version: 3.6.8
  • CUDA/cuDNN version: 9.0
  • GPU model and memory: 2x GTX 1080, 8 GB each

Describe the current behavior Basically, if the dataset doesn't have enough elements to feed both GPUs' last batches, the training hangs.

  • If you don't have enough to feed the first GPU's last batch and don't want to drop the last batch, then the training hangs.
  • If you don't have enough to feed the first GPU's last batch and want to drop the last batch, then you're fine.
  • If you have enough to feed the first GPU's last batch but not the second GPU's last batch and don't want to drop the last batch, then the training hangs.
  • If you have enough to feed the first GPU's last batch but not the second GPU's last batch and want to drop the last batch, then the training hangs.

Describe the expected behavior

  • If you don't have enough to feed the first GPU's last batch and don't want to drop the last batch, then run the first GPU's partial batch and do nothing on the second GPU.
  • If you don't have enough to feed the first GPU's last batch and want to drop the last batch, then drop the last batch for both GPUs.
  • If you have enough to feed the first GPU's last batch but not the second GPU's last batch and don't want to drop the last batch, then run the first GPU's full batch and run the second GPU's partial batch.
  • If you have enough to feed the first GPU's last batch but not the second GPU's last batch and want to drop the last batch, then run the first GPU's full batch and do nothing on the second GPU.

Code to reproduce the issue

import tensorflow as tf

# Play with sample count (5, 6, 7) and drop_remainder (True, False) to reproduce the issue
sample_count = 5
drop_remainder = False


def run():
    # Config
    run_config = tf.estimator.RunConfig(
        session_config=tf.ConfigProto(allow_soft_placement=True),
        train_distribute=tf.contrib.distribute.MirroredStrategy(num_gpus=2))

    # Estimator
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
    estimator.train(train_input_fn)


# Times two dataset
def train_input_fn():
    return tf.data.Dataset \
        .range(sample_count) \
        .repeat(1) \
        .map(lambda x: (x, x * 2)) \
        .batch(2, drop_remainder)


# Times two model
def model_fn(features, labels, mode):
    input_layer = tf.cast(tf.reshape(features, [-1, 1]), tf.float32)
    expected_output = tf.cast(tf.reshape(labels, [-1, 1]), tf.float32)

    logit = tf.layers.dense(input_layer, 1, None, False)
    loss = tf.losses.mean_squared_error(expected_output, logit)

    logging_hook = tf.train.LoggingTensorHook(tensors={"feature_value": features.name}, every_n_iter=1)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(0.001)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=[logging_hook])


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.DEBUG)
    run()

Other info / logs Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.

INFO:tensorflow:Initializing RunConfig with distribution strategies.
INFO:tensorflow:Not using Distribute Coordinator.
WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmp0ofz0qx1
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_device_fn': None, '_experimental_distribute': None, '_task_type': 'worker', '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_distribute_coordinator_mode': None, '_service': None, '_save_summary_steps': 100, '_model_dir': '/tmp/tmp0ofz0qx1', '_master': '', '_keep_checkpoint_max': 5, '_train_distribute': <tensorflow.contrib.distribute.python.mirrored_strategy.MirroredStrategy object at 0x7feedf9f5dd8>, '_protocol': None, '_task_id': 0, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
, '_is_chief': True, '_num_worker_replicas': 1, '_global_id_in_cluster': 0, '_evaluation_master': '', '_log_step_count_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7feedf9f5e48>, '_eval_distribute': None, '_num_ps_replicas': 0}
2019-02-28 11:18:22.645478: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-02-28 11:18:22.818624: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:964] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-02-28 11:18:22.820057: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: 
name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.847
pciBusID: 0000:01:00.0
totalMemory: 7.90GiB freeMemory: 7.11GiB
2019-02-28 11:18:22.954140: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:964] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-02-28 11:18:22.955822: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 1 with properties: 
name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.847
pciBusID: 0000:02:00.0
totalMemory: 7.93GiB freeMemory: 7.81GiB
2019-02-28 11:18:22.957142: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0, 1
2019-02-28 11:18:23.349095: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-02-28 11:18:23.349133: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      0 1 
2019-02-28 11:18:23.349139: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0:   N Y 
2019-02-28 11:18:23.349143: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 1:   Y N 
2019-02-28 11:18:23.349775: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/device:GPU:0 with 6853 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1)
2019-02-28 11:18:23.350097: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/device:GPU:1 with 7535 MB memory) -> physical GPU (device: 1, name: GeForce GTX 1080, pci bus id: 0000:02:00.0, compute capability: 6.1)
INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_CPU:0
INFO:tensorflow:Configured nccl all-reduce.
2019-02-28 11:18:23.372783: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0, 1
2019-02-28 11:18:23.373002: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-02-28 11:18:23.373032: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      0 1 
2019-02-28 11:18:23.373038: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0:   N Y 
2019-02-28 11:18:23.373043: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 1:   Y N 
2019-02-28 11:18:23.373272: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6853 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1)
2019-02-28 11:18:23.373346: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 7535 MB memory) -> physical GPU (device: 1, name: GeForce GTX 1080, pci bus id: 0000:02:00.0, compute capability: 6.1)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:batch_all_reduce invoked for batches size = 1 with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
2019-02-28 11:18:23.707824: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0, 1
2019-02-28 11:18:23.707940: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-02-28 11:18:23.707963: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      0 1 
2019-02-28 11:18:23.707967: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0:   N Y 
2019-02-28 11:18:23.707988: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 1:   Y N 
2019-02-28 11:18:23.708250: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6853 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1)
2019-02-28 11:18:23.708475: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 7535 MB memory) -> physical GPU (device: 1, name: GeForce GTX 1080, pci bus id: 0000:02:00.0, compute capability: 6.1)
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp0ofz0qx1/model.ckpt.
INFO:tensorflow:loss = 47.902126, step = 0
INFO:tensorflow:feature_value = [2 3]

closed time in 2 months

Silb78dg

issue commenttensorflow/tensorflow

Estimator training hangs in multiple gpu if dataset doesn't have enough element to feed both gpus last batches

@rxsang mentioned that this is fixed in TF 2, so I'm closing the issue.
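
For anyone migrating off TF 1.12, a minimal TF 2-style sketch of the same kind of setup (Keras plus tf.distribute.MirroredStrategy); the model and data here are illustrative, not the original reproduction:

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()  # uses all visible GPUs, or the CPU if none are found

with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, use_bias=False)])
    model.compile(optimizer="adam", loss="mse")

# 5 samples with batch size 2 and no drop_remainder, so the final batch is partial.
ds = (tf.data.Dataset.range(5)
      .map(lambda x: (tf.reshape(tf.cast(x, tf.float32), [1]),
                      tf.reshape(tf.cast(x * 2, tf.float32), [1])))
      .batch(2))
model.fit(ds, epochs=1)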

Silb78dg

comment created time in 2 months

issue closedtensorflow/tensorflow

Trained model inference on GPU of nvidia TX2 get poor result even error result

Please make sure that this is a bug. As per our GitHub Policy, we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:bug_template

System information

  • Have I written custom code (as opposed to using a stock example script provided in TensorFlow):
  • OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Linux Ubuntu 16.04
  • Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device:nvidia TX2
  • TensorFlow installed from (source or binary): binary from https://nvidia.box.com/v/JP33-TF1-11-0-py35-wTRT
  • TensorFlow version (use command below): 1.11.0
  • Python version: 3.5
  • Bazel version (if compiling from source):
  • GCC/Compiler version (if compiling from source):
  • CUDA/cuDNN version: CUDA 9.0 / cuDNN 7.1.5
  • GPU model and memory: 8 GB

You can collect some of this information using our environment capture script. You can also obtain the TensorFlow version with python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"

Describe the current behavior I trained the model on the server and deployed the same version of TensorFlow on the TX2, but when I run the trained model on the TX2's GPU, I get much worse results than on the server; running the model on the TX2's CPU does not cause this problem.

Describe the expected behavior

  1. The result of running on the server's GPU should be the same as the result of running on the TX2's GPU. There should not be such a big gap.
  2. The result of running on the TX2's GPU should be the same as the result of running on the TX2's CPU.

Code to reproduce the issue Provide a reproducible test case that is the bare minimum necessary to generate the problem.

Other info / logs Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.

closed time in 2 months

xuefengxiaoyang

issue commenttensorflow/tensorflow

Trained model inference on GPU of nvidia TX2 get poor result even error result

Closing since it looks like this is resolved as per https://github.com/tensorflow/tensorflow/issues/26310#issuecomment-541312223

xuefengxiaoyang

comment created time in 2 months

issue commenttensorflow/tensorflow

tf-gpu==1.13.1 : 35% less batch size before OOM vs tf-gpu==1.11.0

@timshen91 Can you please take a look at this? I think the next steps are:

  1. Verify that this is still a problem.
  2. If it is still a problem, file a bug with NVIDIA (CC @nluehr).
iperov

comment created time in 2 months

issue commenttensorflow/tensorflow

GPU is idle even when there are operations ready to be executed

Closing as per https://github.com/tensorflow/tensorflow/issues/25724#issuecomment-522719673: this is a rare issue that has a workaround.

xilenteyex

comment created time in 2 months

issue closedtensorflow/tensorflow

GPU is idle even when there are operations ready to be executed

Please make sure that this is a bug. As per our GitHub Policy, we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:bug_template

System information

  • Have I written custom code (as opposed to using a stock example script provided in TensorFlow): Yes
  • OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Ubuntu 18.04.1 LTS
  • TensorFlow installed from (source or binary): source
  • TensorFlow version (use command below): 1.12.0
  • Python version: Python 2.7.15rc1
  • Bazel version (if compiling from source): 0.21
  • GCC/Compiler version (if compiling from source): (Ubuntu 7.3.0-27ubuntu1~18.04) 7.3.0
  • CUDA/cuDNN version: 10.0/7.4.2
  • GPU model and memory: NVIDIA Corporation GP100GL [Tesla P100 PCIe 12GB] (2 GPUs)

You can collect some of this information using our environment capture script. You can also obtain the TensorFlow version with python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"

Describe the current behavior I am running a toy matrix multiplication example (code here; the timeline can be found here). If you look at the timeline, the operations named MatMul_1 and MatMul_2 are placed on GPU1 while all other matrix multiplications are placed on GPU0. The operation named MatMul_2 takes input from the operations named MatMul and MatMul_1. MatMul and MatMul_1 complete at almost the same time, but there is a large gap after they complete before the execution of MatMul_2 starts. I am not sure why this gap is there.

Describe the expected behavior My understanding is that as soon as the MEMCPYPtoP has completed (the op named MatMul is placed on GPU0), the execution of MatMul_2 should start. Is this a performance issue, or am I missing something here?

Code to reproduce the issue toy_matmul code

Other info / logs timeline for toy_matmul. Below is a snapshot of the timeline in the chrome-tracing visualizer. [Screenshot: chrome-tracing timeline, 2019-02-13]

closed time in 2 months

xilenteyex

issue commenttensorflow/tensorflow

tf.dataset + tf.estimator slow, starving CPU/GPU

For these kinds of performance issues, it is most productive to poke at the TensorBoard profile to see if you can spot any bottlenecks.
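
If the profile does point at an input-bound step, the usual tf.data mitigations look roughly like this (a generic sketch; parse_example and the file pattern are placeholders, not taken from the original report):

import tensorflow as tf

AUTOTUNE = tf.data.experimental.AUTOTUNE  # tf.data.AUTOTUNE in newer releases

def train_input_fn():
    # `parse_example` is a user-defined parsing function; the glob pattern is a placeholder.
    files = tf.io.gfile.glob("/path/to/train-*.tfrecord")
    ds = tf.data.TFRecordDataset(files)
    ds = ds.map(parse_example, num_parallel_calls=AUTOTUNE)
    ds = ds.shuffle(10000).batch(128)
    ds = ds.prefetch(AUTOTUNE)  # overlap the input pipeline with accelerator work
    return ds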

mathlf2015

comment created time in 2 months

issue closedtensorflow/tensorflow

Check failed: CUDA_SUCCESS == cuCtxSetCurrent(cuda_context->context()) (0 vs. 4)

Please make sure that this is a bug. As per our GitHub Policy, we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:bug_template

System information

  • Have I written custom code (as opposed to using a stock example script provided in TensorFlow): Yes
  • OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Ubuntu 18.04
  • Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device: No
  • TensorFlow installed from (source or binary): No, compiled from Source
  • TensorFlow version (use command below): 1.12
  • Python version: 2.7
  • Bazel version (if compiling from source): Bazel release 0.17.2
  • GCC/Compiler version (if compiling from source): gcc (Ubuntu 7.3.0-27ubuntu1~18.04) 7.3.0
  • CUDA/cuDNN version: CUDA 10.0 / cuDNN 7.3.1
  • GPU model and memory: Nvidia GeForce RTX 2080 (two GPUs), each with 8 GB memory

Describe the current behavior

I've compiled TensorFlow using the following command to get libtensorflow_cc.so (non-monolithic build):

bazel build --config=cuda //tensorflow:libtensorflow_cc.so

I would like to use multiple GPUs at inference time in C++. Currently I have two GeForce RTX 2080 GPUs, so I'm running two threads (with thread ids 0 and 1) in a standalone C++ example. I'm using label_image (main.cc).

I want to have two TensorFlow sessions, one per thread, each using its own GPU. The thread with thread_id 0 uses gpu:0 and the thread with thread_id 1 uses gpu:1.

My load_graph method looks as follows:

Status LoadGraph(const string& graph_file_name,
                 std::shared_ptr<tensorflow::Session>* session, int i_threadid) {
  tensorflow::GraphDef graph_def;
  printf(".. IN load graph %d\n", i_threadid);
  Status load_graph_status =
      ReadBinaryProto(tensorflow::Env::Default(), graph_file_name, &graph_def);
  if (!load_graph_status.ok()) {
    return tensorflow::errors::NotFound("Failed to load compute graph at '",
                                        graph_file_name, "'");
  }
  tensorflow::SessionOptions session_options;
  if(i_threadid == 0) {
    session_options.config.mutable_gpu_options()->set_visible_device_list("0");
    tensorflow::graph::SetDefaultDevice("/device:GPU:0", &graph_def);
  } else if(i_threadid == 1) {
    session_options.config.mutable_gpu_options()->set_visible_device_list("0,1");
    tensorflow::graph::SetDefaultDevice("/device:GPU:1", &graph_def);
  }
  session_options.config.mutable_gpu_options()->set_allow_growth(true);
  session_options.config.set_allow_soft_placement(true);
  session->reset(tensorflow::NewSession(session_options));

  //tensorflow::graph::SetDefaultDevice("/cpu:0", &graph_def);
  Status session_create_status = (*session)->Create(graph_def);
  if (!session_create_status.ok()) {
    return session_create_status;
  }
  return Status::OK();
}

In the else branch, if I instead use the following line:
session_options.config.mutable_gpu_options()->set_visible_device_list("1");
I get the following error:

2019-01-26 21:47:00.872434: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
2019-01-26 21:47:01.130371: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: 
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.8
pciBusID: 0000:17:00.0
totalMemory: 7.77GiB freeMemory: 7.62GiB
2019-01-26 21:47:01.130396: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2019-01-26 21:47:01.388128: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-01-26 21:47:01.388163: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      0 
2019-01-26 21:47:01.388170: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0:   N 
2019-01-26 21:47:01.388331: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 7337 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:17:00.0, compute capability: 7.5)
2019-01-26 21:47:01.632637: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: 
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.8
pciBusID: 0000:65:00.0
totalMemory: 7.76GiB freeMemory: 7.47GiB
2019-01-26 21:47:01.632673: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 1
2019-01-26 21:47:01.898614: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-01-26 21:47:01.898649: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      1 
2019-01-26 21:47:01.898655: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 1:   N 
2019-01-26 21:47:01.898776: E tensorflow/core/common_runtime/session.cc:64] Failed to create session: Already exists: TensorFlow device (GPU:0) is being mapped to multiple CUDA devices (1 now, and 0 previously), which is not supported. This may be the result of providing different GPU configurations (ConfigProto.gpu_options, for example different visible_device_list) when creating multiple Sessions in the same process. This is not  currently supported, see https://github.com/tensorflow/tensorflow/issues/19083
Segmentation fault (core dumped)

So I added session_options.config.mutable_gpu_options()->set_visible_device_list("0,1"); so that the second thread (thread_id 1) creates a session that uses both GPUs (or at least initializes both GPUs) while its graph runs on the second GPU. Now I get the following output without any error:

2019-01-26 21:51:20.126025: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
2019-01-26 21:51:20.380183: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: 
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.8
pciBusID: 0000:17:00.0
totalMemory: 7.77GiB freeMemory: 7.62GiB
2019-01-26 21:51:20.380209: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2019-01-26 21:51:20.637473: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-01-26 21:51:20.637508: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      0 
2019-01-26 21:51:20.637513: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0:   N 
2019-01-26 21:51:20.637668: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 7337 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:17:00.0, compute capability: 7.5)
2019-01-26 21:51:20.892308: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 1 with properties: 
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.8
pciBusID: 0000:65:00.0
totalMemory: 7.76GiB freeMemory: 7.47GiB
2019-01-26 21:51:20.892414: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0, 1
2019-01-26 21:51:21.142666: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-01-26 21:51:21.142699: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      0 1 
2019-01-26 21:51:21.142705: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0:   N N 
2019-01-26 21:51:21.142709: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 1:   N N 
2019-01-26 21:51:21.142932: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 7337 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:17:00.0, compute capability: 7.5)
2019-01-26 21:51:21.143217: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 7187 MB memory) -> physical GPU (device: 1, name: GeForce RTX 2080, pci bus id: 0000:65:00.0, compute capability: 7.5)

Image resizing is done using the following code, as I want to run this resizing computation on the CPU for better efficiency:

Status LoadImageResizeGraph(std::shared_ptr<tensorflow::Session>* session, const int input_height,
                               const int input_width, const float input_mean,
                               const float input_std) {
  string output_name = "normalized";
  auto root = tensorflow::Scope::NewRootScope();
  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)

  // use a placeholder to read input data
  auto file_reader =
      Placeholder(root.WithOpName("input"), tensorflow::DataType::DT_STRING);

  // Now try to figure out what kind of file it is and decode it.
  const int wanted_channels = 3;
  tensorflow::Output image_reader;
  
    image_reader = DecodeJpeg(root.WithOpName("jpeg_reader"), file_reader,
                              DecodeJpeg::Channels(wanted_channels));
  // Now cast the image data to float so we can do normal math on it.
  auto float_caster =
      Cast(root.WithOpName("float_caster"), image_reader, tensorflow::DT_FLOAT);
  // The convention for image ops in TensorFlow is that all images are expected
  // to be in batches, so that they're four-dimensional arrays with indices of
  // [batch, height, width, channel]. Because we only have a single image, we
  // have to add a batch dimension of 1 to the start with ExpandDims().
  auto dims_expander = ExpandDims(root, float_caster, 0);
  // Bilinearly resize the image to fit the required dimensions.
  auto resized = ResizeBilinear(
      root, dims_expander,
      Const(root.WithOpName("size"), {input_height, input_width}));
  // Subtract the mean and divide by the scale.
  Div(root.WithOpName(output_name), Sub(root, resized, {input_mean}),
      {input_std});

  // This runs the GraphDef network definition that we've just constructed, and
  // returns the results in the output tensor.
  tensorflow::GraphDef graph_def;
  TF_RETURN_IF_ERROR(root.ToGraphDef(&graph_def));

  tensorflow::SessionOptions session_options;
  session_options.config.mutable_gpu_options()->set_visible_device_list("0");
  session_options.config.mutable_gpu_options()->set_allow_growth(true);
  session_options.config.set_allow_soft_placement(true);
  session->reset(tensorflow::NewSession(session_options));
  tensorflow::graph::SetDefaultDevice("/cpu:0", &graph_def);
  TF_RETURN_IF_ERROR((*session)->Create(graph_def));
  return Status::OK();
}

I'm using Inception_v3 as specified in main.cc and want to run inference on 1000 images. Each thread runs 500 images in parallel for efficient runtime.

But at the end of the program, I'm getting the following error

[8.853 s] Finished running 1000 images successfully
2019-01-26 21:59:10.712562: F tensorflow/stream_executor/cuda/cuda_driver.cc:206] Check failed: CUDA_SUCCESS == cuCtxSetCurrent(cuda_context->context()) (0 vs. 4)
Aborted (core dumped)

I searched the issue history, and it seems other people are facing a similar problem, but no successful resolution has been mentioned:

https://github.com/tensorflow/tensorflow/issues/3509 https://github.com/tensorflow/tensorflow/issues/10961 https://github.com/tensorflow/tensorflow/issues/526

In issue #526, one of the owners of StreamExecutor mentioned a resolution (https://github.com/tensorflow/tensorflow/issues/526#issuecomment-236259722), but it is not working for me.

Can someone please help me fix this error and use multiple GPUs with multiple threads in a single program?

Thank you

Code to reproduce the issue My sample code:


int Initialize(const string& graph_file_name, int i_nthreads) {
  int32 input_width = 299;
  int32 input_height = 299;
  float input_mean = 0;
  float input_std = 255;
  nthreads = i_nthreads;

  // Initialize all sessions;
  for(int i = 0; i < nthreads; i++) {

    int threadid = i;
    if(threadid == 0) {
      LoadGraph(graph_file_name, &global_session_00, 0);
    } else if (threadid == 1) {
      LoadGraph(graph_file_name, &global_session_10, 1);
    }
  }

  LoadImageResizeGraph(&global_session_01, input_height, input_width, input_mean, input_std);
  return 0;
}

void *Perform_Computation_Parallel(void *arg1)
{

  
  string input_layer = "input";
  string output_layer = "InceptionV3/Predictions/Reshape_1";


  int threadid = (int)(((size_t)(arg1)));
  printf("threadid %d\n", threadid);

  std::shared_ptr<tensorflow::Session> session;
  if(threadid == 0) {
    session = global_session_00;
  } else if(threadid == 1) {
    session = global_session_10;
  }

  char *image_path = (char *)malloc(1000000);
  for(int r = threadid; r < global_number_of_images; r += nthreads)
  {
    sprintf(image_path, "%s", global_images[r]);
    printf("%s --------> %d \n", image_path, threadid);

    std::string image_file(image_path);
    std::vector<Tensor> resized_tensors;

    // read file_name into a tensor named input
    Tensor input_image(tensorflow::DT_STRING, tensorflow::TensorShape());
    
    ReadEntireFile(tensorflow::Env::Default(), image_file, &input_image);

    Status run_resize_status = global_session_01->Run({{"input", input_image}},
                                      {"normalized"}, {}, &resized_tensors);

    if (!run_resize_status.ok()) {
      LOG(ERROR) << "Running resize graph failed: " << run_resize_status;
    } 
    // else if(run_resize_status.ok()){
    //   std::cout << "successfully resized image "<< image_file << "\n";
    // }

    // Status read_tensor_status =
    //         ReadTensorFromImageFile(image_file, input_height, input_width, input_mean,
    //                                 input_std, &resized_tensors);
    // if (!read_tensor_status.ok()) {
    //   LOG(ERROR) << read_tensor_status;
    //   //return -1;
    // }
    const Tensor& resized_tensor = resized_tensors[0];


    // Actually run the image through the model.
    std::vector<Tensor> outputs;
    Status run_status = session->Run({{input_layer, resized_tensor}},
                                     {output_layer}, {}, &outputs);
    if (!run_status.ok()) {
      LOG(ERROR) << "Running model failed: " << run_status;
      //return -1;
    }
  }

    
    
  return nullptr;
}

void Perform_Computation(void)
{
    for(long long int i = 1; i < nthreads; i++) pthread_create(&threads[i], NULL, Perform_Computation_Parallel, (void *)(i));
    Perform_Computation_Parallel(0);
    for(long long int i = 1; i < nthreads; i++) pthread_join(threads[i], NULL);
}

int main(int argc, char* argv[]) {
  //printf("Hello World!!!\n");

  // These are the command-line flags the program can understand.
  // They define where the graph and input data is located, and what kind of
  // input the model expects. If you train your own model, or use something
  // other than inception_v3, then you'll need to update these.
  string image = "./data/grace_hopper.jpg";
  string graph =
      "./data/inception_v3_2016_08_28_frozen.pb";
  string labels =
      "./data/imagenet_slim_labels.txt";
  int32 input_width = 299;
  int32 input_height = 299;
  float input_mean = 0;
  float input_std = 255;
  string input_layer = "input";
  string output_layer = "InceptionV3/Predictions/Reshape_1";
  bool self_test = false;
  string root_dir = "";
  string test_file = "./data/images/test_images_1000.txt";
  string out_file = "./data/results/test_results_1000.txt";

  //nthreads = 2;
  int ret_init_val = Initialize(graph, nthreads);
  std::cout << " Initialization done --------------------------\n";
  FILE *fp = fopen(test_file.c_str(), "r");
  int nol = 0;


  char *line = (char *)malloc(2048);

  while (!feof(fp))
  {
    line[0] = '\0';
    fgets(line, 2048, fp);
    if (line[0] == '\0')  break;
    nol++;
  }
  fclose(fp);

  global_number_of_images = nol;

  global_images = (char **)malloc(global_number_of_images * sizeof(char *));
  

  fp = fopen(test_file.c_str(), "r");
  for(int q = 0; q < global_number_of_images; q++)
  {
    line[0] = '\0';
    fgets(line, 2048, fp);
    line[strlen(line)-1] = '\0';
    global_images[q] = (char *)malloc(strlen(line)+10);
    sprintf(global_images[q], "%s", line);
  }
  fclose(fp);
  double t0 = elapsed();
  Perform_Computation();
  printf("[%.3f s] Finished running %d images successfully\n", elapsed() - t0, nol);
  return 0;
}

closed time in 2 months

tumusudheer

issue commenttensorflow/tensorflow

Check failed: CUDA_SUCCESS == cuCtxSetCurrent(cuda_context->context()) (0 vs. 4)

Closing because of lack of activity. If you want this reopened, please first try @chsigg's suggestion.

tumusudheer

comment created time in 2 months

issue closedtensorflow/tensorflow

SetDefaultDevice doesn't seem to work; Multiple Tensorflow Session on Separate GPU's Cannot Seem to Speed up Inference

I am trying to speed up TensorFlow inference by creating multiple Sessions, with each Session loading its own graph on its own GPU. When I ran this model on a batch of 10 images on a single GPU, it took about 2700 ms. I was hoping I could run two batches, one per GPU, and process 20 images in the same time frame. Instead, the run actually took about 5300 ms, so it seems I was not able to get the speedup I was hoping for.

I am running TensorFlow 1.7 with two Quadro GV100s. I did not get any error messages running my code. Below is my code:

auto options = SessionOptions();
options.config.mutable_gpu_options()->set_visible_device_list("0,1");

NewSession(options, &m_session[0]);
NewSession(options, &m_session[1]);

GraphDef graph_def0;
graph::SetDefaultDevice("/device:GPU:0", &graph_def0);
ReadBinaryProto(Env::Default(), graphPath, &graph_def0);
m_session[0]->Create(graph_def0);

GraphDef graph_def1;
graph::SetDefaultDevice("/device:GPU:1", &graph_def1);
ReadBinaryProto(Env::Default(), graphPath, &graph_def1);
m_session[1]->Create(graph_def1);

//list0 and list1 are list of images, CallSessionRun()'s 2nd arg is index into m_session
std::future<std::vector<std::vector<tf_detection>>> fut0 = std::async([&]()->std::vector<std::vector<tf_detection>>{
	auto detections = CallSessionRun(list0, 0);
	return detections;
});

std::future<std::vector<std::vector<tf_detection>>> fut1 = std::async([&]()->std::vector<std::vector<tf_detection>>{
	auto detections = CallSessionRun(list1, 1);
	return detections;
});

auto ans0 = fut0.get();
auto ans1 = fut1.get();

graph::SetDefaultDevice is supposed to dedicate a GPU to a graph, and calling m_session[i]->Run() in std::async is supposed to utilize each session concurrently. But it didn't seem to work. Am I missing something?

The fact that the run time stays pretty much the same seems to suggest that graph::SetDefaultDevice() does not work and I am only using 1 GPU instead of 2.

Thank you very much for your help in advance!

closed time in 2 months

GothamCityPro

issue commenttensorflow/tensorflow

SetDefaultDevice doesn't seem to work; Multiple Tensorflow Session on Separate GPU's Cannot Seem to Speed up Inference

Closing because of lack of activity. If you want this reopened, please try @aaroey's advice first and see if that helps.

GothamCityPro

comment created time in 2 months
