Sanjoy Das sanjoy @google @GoogleCloudPlatform CA http://playingwithpointers.com Compilers and VMs

sanjoy/DietLISP 33

Toy lisp interpreter / PLT playground

sanjoy/CmdArgs 20

Painless command line argument parsing in C.

sanjoy/L 5

A Small Evaluator for Untyped Lambda Calculus

sanjoy/clayoven 1

Modern website generator with a traditional design

sanjoy/eelish 1

n00b programming without locks

sanjoy/impossible-programs 1

http://math.andrej.com/2007/09/28/seemingly-impossible-functional-programs/ in C++

issue comment tensorflow/tensorflow

no kernel image is available for execution on the device

Yup, tf-nightly should work on cards with CC 5.0 (and so would TF 2.4).

aligoglos

comment created time in 3 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 class XlaCompilationCache : public ResourceBase {
     std::unique_ptr<xla::LocalExecutable> executable TF_GUARDED_BY(mu);
   };
 
+  Status CompileStrict(
+    Entry* entry, const XlaCompiler::Options& options,
+    const std::vector<XlaCompiler::Argument>& args,
+    const string &function_name,
+    const std::function<Status(XlaCompiler* compiler,
+                               const std::vector<XlaCompiler::Argument>& args,
+                               XlaCompiler::CompilationResult*)>& compile_fn);
+  Status CompileAsynchronous(
+    Entry* entry, const XlaCompiler::Options& options,
+    const std::vector<XlaCompiler::Argument>& args,
+    const string &function_name,
+    const std::function<Status(XlaCompiler* compiler,
+                               const std::vector<XlaCompiler::Argument>& args,
+                               XlaCompiler::CompilationResult*)>& compile_fn);

Have you looked into making CompileAsynchronous a separate method / class that lives outside XlaCompilationCache and interacts with it only via its public interface (suitably refactored)? That might simplify the change a bit.
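
Roughly the shape I have in mind (a hypothetical sketch only: XlaAsyncCompiler is a made-up name, and the lookup/publish interaction with the cache is elided -- the point is that the threads live outside XlaCompilationCache and talk to it only through its public interface):

#include <functional>

#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/status.h"

// Hypothetical sketch: XlaAsyncCompiler is a made-up name and the
// lookup/publish interaction with XlaCompilationCache is elided.
class XlaAsyncCompiler {
 public:
  explicit XlaAsyncCompiler(int num_threads)
      : pool_(tensorflow::Env::Default(), "async_xla_compiler", num_threads) {}

  // Runs compile_fn on a background thread; compile_fn is expected to publish
  // its result into the cache itself via the cache's public interface.
  void Schedule(std::function<tensorflow::Status()> compile_fn) {
    pool_.Schedule([compile_fn]() {
      tensorflow::Status s = compile_fn();
      if (!s.ok()) {
        LOG(WARNING) << "Asynchronous XLA compilation failed: " << s;
      }
    });
  }

 private:
  tensorflow::thread::ThreadPool pool_;
};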

bas-aarts

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 class XlaPlatformInfo {
   // If the op associated with this XlaPlatformInfo is placed on an XLA device
   // then device_allocator_ is the xla::Backend's memory allocator.  If the op
   // is placed on a regular CPU or GPU device then device_allocator_ is null.
-  se::DeviceMemoryAllocator* device_allocator_;
+  // The allocator is of unknowm provenance; keep it in a shared pointer to

s/unknowm/unknown/

bas-aarts

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 class XlaCompilationCache : public ResourceBase {
   absl::flat_hash_map<string, ClusterCompileStats> cluster_compile_stats_
       TF_GUARDED_BY(cluster_compile_stats_mu_);
 
+  struct AsyncCompilation {
+    mutex async_compilation_mu_;
+
+    // Number of threads for asynchronous compilations.
+    static constexpr int64 kNrofCompilerThreads = 10;
+
+    // Maximum number of ongoing compilations.
+    static constexpr int64 kMaxNrofOngoingCompilations = kNrofCompilerThreads;
+
+    // Pool of threads for asynchronous compilations.
+    thread::ThreadPool compiler_threads;
+
+    // Number of ongoing compilations.
+    int64 nrof_ongoing_compilations GUARDED_BY(async_compilation_mu_) = 0;
+
+    AsyncCompilation()
+      : compiler_threads(tensorflow::Env::Default(), "aync_compiler_threads",
+                         kNrofCompilerThreads) {}
+    ~AsyncCompilation() {}
+
+  } async_compilation_;
+
   // The number of times a lazy compilation must be requested for a specific
   // signature before  we attempt to compile it.
-  static constexpr int64 kDefaultCompilationThreshold = 2;
+  static constexpr int64 kDefaultCompilationThreshold = 3;

This is a separate change, right?

bas-aarts

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 class XlaCompilationCache : public ResourceBase {
   absl::flat_hash_map<string, ClusterCompileStats> cluster_compile_stats_
       TF_GUARDED_BY(cluster_compile_stats_mu_);
 
+  struct AsyncCompilation {
+    mutex async_compilation_mu_;
+
+    // Number of threads for asynchronous compilations.
+    static constexpr int64 kNrofCompilerThreads = 10;
+
+    // Maximum number of ongoing compilations.
+    static constexpr int64 kMaxNrofOngoingCompilations = kNrofCompilerThreads;
+
+    // Pool of threads for asynchronous compilations.
+    thread::ThreadPool compiler_threads;
+
+    // Number of ongoing compilations.
+    int64 nrof_ongoing_compilations GUARDED_BY(async_compilation_mu_) = 0;
+
+    AsyncCompilation()
+      : compiler_threads(tensorflow::Env::Default(), "aync_compiler_threads",
+                         kNrofCompilerThreads) {}
+    ~AsyncCompilation() {}

Is this needed?

bas-aarts

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 class XlaCompilationCache : public ResourceBase {
   absl::flat_hash_map<string, ClusterCompileStats> cluster_compile_stats_
       TF_GUARDED_BY(cluster_compile_stats_mu_);
 
+  struct AsyncCompilation {
+    mutex async_compilation_mu_;
+
+    // Number of threads for asynchronous compilations.
+    static constexpr int64 kNrofCompilerThreads = 10;
+
+    // Maximum number of ongoing compilations.
+    static constexpr int64 kMaxNrofOngoingCompilations = kNrofCompilerThreads;
+
+    // Pool of threads for asynchronous compilations.
+    thread::ThreadPool compiler_threads;

I'm somewhat hesitant about this since this will contend with real computation happening on the CPU. Is there a way to use TF's own thread pool (tensorflow_device_thread_pool)?
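
For reference, a sketch of what borrowing an existing pool could look like -- this uses the device's CPU worker threads, which may or may not be the pool we actually want here, and the helper name is made up:

#include <functional>

#include "tensorflow/core/framework/op_kernel.h"

// Sketch: schedule work on the CPU worker threads TF already owns for this
// device instead of creating a dedicated pool. Whether that pool is
// appropriate for background compilations is exactly the open question (it
// contends with op execution by design).
void ScheduleOnDeviceWorkers(tensorflow::OpKernelContext* ctx,
                             std::function<void()> work) {
  tensorflow::thread::ThreadPool* workers =
      ctx->device()->tensorflow_cpu_worker_threads()->workers;
  workers->Schedule(std::move(work));
}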

bas-aarts

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 class XlaCompilationCache : public ResourceBase {
   absl::flat_hash_map<string, ClusterCompileStats> cluster_compile_stats_
       TF_GUARDED_BY(cluster_compile_stats_mu_);
 
+  struct AsyncCompilation {
+    mutex async_compilation_mu_;
+
+    // Number of threads for asynchronous compilations.
+    static constexpr int64 kNrofCompilerThreads = 10;

How about kNumCompilerThreads?

bas-aarts

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 void LogOnceXlaCompiledFirstCluster() { }
 }  // namespace
 
+Status XlaCompilationCache::CompileStrict(
+    Entry* entry, const XlaCompiler::Options& options,
+    const std::vector<XlaCompiler::Argument>& args,
+    const string &function_name,
+    const std::function<Status(XlaCompiler* compiler,
+                               const std::vector<XlaCompiler::Argument>& args,
+                               XlaCompiler::CompilationResult*)>& compile_fn) {
+  tensorflow::Env* env = tensorflow::Env::Default();
+  const uint64 compile_start_us = env->NowMicros();
+
+  XlaCompiler compiler(options);
+  entry->compile_state = CompileState::kCompiled;
+
+  entry->compilation_status =
+      compile_fn(&compiler, args, &entry->compilation_result);
+  TF_RETURN_IF_ERROR(entry->compilation_status);
+  CHECK_EQ(entry->executable.get(), nullptr);
+  entry->compilation_status =
+      BuildExecutable(options, entry->compilation_result, &entry->executable);
+
+  const uint64 compile_end_us = env->NowMicros();
+  const uint64 compile_time_us = compile_end_us - compile_start_us;
+  metrics::UpdateXlaCompilationTime(compile_time_us);
+  {
+    mutex_lock lock(cluster_compile_stats_mu_);
+    auto it = cluster_compile_stats_.find(function_name);
+    const uint64 compile_time_s = compile_time_us / 1.0e6;
+    it->second.compile_count++;
+    it->second.cumulative_compile_time_us += compile_time_us;
+    it->second.max_compile_time_s = std::max(it->second.max_compile_time_s,
+                                             compile_time_s);
+    LogOnceXlaCompiledFirstCluster();
+    VLOG(1) << "compiled " << function_name << " "
+            << it->second.compile_count
+            << " times, compile time: " << compile_time_us
+            << " us, cumulative: " << it->second.cumulative_compile_time_us
+            << " us ("
+            << tensorflow::strings::HumanReadableElapsedTime(compile_time_s)
+            << " / "
+            << tensorflow::strings::HumanReadableElapsedTime(
+                   it->second.cumulative_compile_time_us / 1.0e6)
+            << ")";
+
+    XlaJitCompilationActivity jit_compilation_activity;
+    jit_compilation_activity.set_cluster_name(function_name);
+    jit_compilation_activity.set_compile_count(it->second.compile_count);
+    jit_compilation_activity.set_compile_time_us(compile_time_us);
+    jit_compilation_activity.set_cumulative_compile_time_us(
+        it->second.cumulative_compile_time_us);
+    TF_RETURN_IF_ERROR(
+        BroadcastXlaActivity(std::move(jit_compilation_activity)));
+  }
+
+  return Status::OK();
+}
+
+Status XlaCompilationCache::CompileAsynchronous(
+    Entry* entry, const XlaCompiler::Options& options,
+    const std::vector<XlaCompiler::Argument>& args,
+    const string &function_name,
+    const std::function<Status(XlaCompiler* compiler,
+                               const std::vector<XlaCompiler::Argument>& args,
+                               XlaCompiler::CompilationResult*)>& compile_fn) {
+  entry->compile_state = CompileState::kCompiling; // Still under caller's lock.
+  {
+    mutex_lock lock(async_compilation_.async_compilation_mu_);
+    async_compilation_.nrof_ongoing_compilations++;
+  }
+  // Don't move the above code into the thread function!!!
+
+  // Passing options by value into the lamba increases the refcount on
+  // options.device_allocator, keeping it alive for the duration of the
+  // compilation.
+  // Passing args by value as well. Doing this here only when an asynchronous
+  // compilation is performed, as copying many argS incurs an overhead.
+  async_compilation_.compiler_threads.Schedule([=] {
+      Entry tmp;

Please use a more descriptive name.

bas-aarts

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 class XlaCompilationCache : public ResourceBase {
   absl::flat_hash_map<string, ClusterCompileStats> cluster_compile_stats_
       TF_GUARDED_BY(cluster_compile_stats_mu_);
 
+  struct AsyncCompilation {
+    mutex async_compilation_mu_;
+
+    // Number of threads for asynchronous compilations.
+    static constexpr int64 kNrofCompilerThreads = 10;
+
+    // Maximum number of ongoing compilations.
+    static constexpr int64 kMaxNrofOngoingCompilations = kNrofCompilerThreads;
+
+    // Pool of threads for asynchronous compilations.
+    thread::ThreadPool compiler_threads;
+
+    // Number of ongoing compilations.
+    int64 nrof_ongoing_compilations GUARDED_BY(async_compilation_mu_) = 0;

Can we call this num_ongoing_compilations?

bas-aarts

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 void LogOnceXlaCompiledFirstCluster() { }
 }  // namespace
 
+Status XlaCompilationCache::CompileStrict(
+    Entry* entry, const XlaCompiler::Options& options,
+    const std::vector<XlaCompiler::Argument>& args,
+    const string &function_name,
+    const std::function<Status(XlaCompiler* compiler,
+                               const std::vector<XlaCompiler::Argument>& args,
+                               XlaCompiler::CompilationResult*)>& compile_fn) {
+  tensorflow::Env* env = tensorflow::Env::Default();
+  const uint64 compile_start_us = env->NowMicros();
+
+  XlaCompiler compiler(options);
+  entry->compile_state = CompileState::kCompiled;
+
+  entry->compilation_status =
+      compile_fn(&compiler, args, &entry->compilation_result);
+  TF_RETURN_IF_ERROR(entry->compilation_status);
+  CHECK_EQ(entry->executable.get(), nullptr);
+  entry->compilation_status =
+      BuildExecutable(options, entry->compilation_result, &entry->executable);
+
+  const uint64 compile_end_us = env->NowMicros();
+  const uint64 compile_time_us = compile_end_us - compile_start_us;
+  metrics::UpdateXlaCompilationTime(compile_time_us);
+  {
+    mutex_lock lock(cluster_compile_stats_mu_);
+    auto it = cluster_compile_stats_.find(function_name);
+    const uint64 compile_time_s = compile_time_us / 1.0e6;
+    it->second.compile_count++;
+    it->second.cumulative_compile_time_us += compile_time_us;
+    it->second.max_compile_time_s = std::max(it->second.max_compile_time_s,
+                                             compile_time_s);
+    LogOnceXlaCompiledFirstCluster();
+    VLOG(1) << "compiled " << function_name << " "
+            << it->second.compile_count
+            << " times, compile time: " << compile_time_us
+            << " us, cumulative: " << it->second.cumulative_compile_time_us
+            << " us ("
+            << tensorflow::strings::HumanReadableElapsedTime(compile_time_s)
+            << " / "
+            << tensorflow::strings::HumanReadableElapsedTime(
+                   it->second.cumulative_compile_time_us / 1.0e6)
+            << ")";
+
+    XlaJitCompilationActivity jit_compilation_activity;
+    jit_compilation_activity.set_cluster_name(function_name);
+    jit_compilation_activity.set_compile_count(it->second.compile_count);
+    jit_compilation_activity.set_compile_time_us(compile_time_us);
+    jit_compilation_activity.set_cumulative_compile_time_us(
+        it->second.cumulative_compile_time_us);
+    TF_RETURN_IF_ERROR(
+        BroadcastXlaActivity(std::move(jit_compilation_activity)));
+  }
+
+  return Status::OK();
+}
+
+Status XlaCompilationCache::CompileAsynchronous(
+    Entry* entry, const XlaCompiler::Options& options,
+    const std::vector<XlaCompiler::Argument>& args,
+    const string &function_name,
+    const std::function<Status(XlaCompiler* compiler,
+                               const std::vector<XlaCompiler::Argument>& args,
+                               XlaCompiler::CompilationResult*)>& compile_fn) {
+  entry->compile_state = CompileState::kCompiling; // Still under caller's lock.
+  {
+    mutex_lock lock(async_compilation_.async_compilation_mu_);
+    async_compilation_.nrof_ongoing_compilations++;
+  }
+  // Don't move the above code into the thread function!!!
+
+  // Passing options by value into the lamba increases the refcount on
+  // options.device_allocator, keeping it alive for the duration of the
+  // compilation.
+  // Passing args by value as well. Doing this here only when an asynchronous
+  // compilation is performed, as copying many argS incurs an overhead.
+  async_compilation_.compiler_threads.Schedule([=] {
+      Entry tmp;
+      VLOG(2) << "Starting asynchronous compilation of cluster "
+              << function_name << '.';
+      (void)CompileStrict(&tmp, options, args, function_name, compile_fn);

I don't think we can ignore the error; we need to report it back to the user. IMO the right solution is to store the Status in entry.
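
Something along these lines (a sketch only; the field names are the ones visible in this diff, the exact locking protocol would need care, and `entry` must be guaranteed to outlive the background compilation):

  // Sketch: publish the async compilation result into the real cache entry
  // instead of discarding it, so the next executor call can surface it.
  async_compilation_.compiler_threads.Schedule([=] {
    Entry local;
    Status s = CompileStrict(&local, options, args, function_name, compile_fn);
    {
      mutex_lock lock(entry->mu);
      entry->compilation_status = s;  // Reported back to the user later.
      entry->compilation_result = std::move(local.compilation_result);
      entry->executable = std::move(local.executable);
      entry->compile_state = CompileState::kCompiled;
    }
  });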

bas-aarts

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

XLA Asynchronous compilation

 constexpr int64 XlaCompilationCache::kDefaultCompilationThreshold;
 
 XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client,
                                          DeviceType device_type)
-    : client_(client), device_type_(std::move(device_type)) {}
+    : client_(client), device_type_(std::move(device_type)), async_compilation_() {}

Is the , async_compilation_() needed?

bas-aarts

comment created time in 6 days

PullRequestReviewEvent
PullRequestReviewEvent
PullRequestReviewEvent

issue comment tensorflow/tensorflow

Support "fetch_skip_sync=false" in Callable

I don't think you need the CUDA streams being used in order to call Device::Sync. If you're comfortable writing C++, you should be able to get the set of Devices via DeviceFactory::AddDevices and then call Device::Sync on the correct device yourself.
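
A rough sketch of what I mean (the helper name and the substring matching are illustrative, and error handling is minimal):

#include <memory>
#include <string>
#include <vector>

#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/public/session_options.h"

// Sketch: enumerate the local devices and synchronize the one you care about.
tensorflow::Status SyncMatchingDevice(const std::string& name_substr) {
  std::vector<std::unique_ptr<tensorflow::Device>> devices;
  TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
      tensorflow::SessionOptions(), "/job:localhost/replica:0/task:0",
      &devices));
  for (auto& device : devices) {
    if (device->name().find(name_substr) != std::string::npos) {
      TF_RETURN_IF_ERROR(device->Sync());
    }
  }
  return tensorflow::Status::OK();
}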

GuanLuo

comment created time in 7 days

issue comment tensorflow/serving

GPU docker 2.2.0 takes too long to initialize

Hi all,

TF serving 2.2.0 used a slightly different build configuration that caused it to JIT PTX at startup for newer GPUs. This issue has been fixed in TF serving 2.3 and you should not be seeing the same kind of startup delay. Given https://github.com/tensorflow/serving/issues/1663#issuecomment-683384880, it looks like the fix is working as intended, so we should close this issue.

@christisg I can't close this issue so assigning to you for closing.

HuiCheng

comment created time in 7 days

PullRequestReviewEvent

issue comment tensorflow/tensorflow

Bug: tensorflow-gpu takes long time before beginning to compute

@RayerXie what version of TF are you using? tf-nightly should not need to block for JIT compilation on compute capability 6.1.

kettenfett

comment created time in 13 days

Pull request review comment tensorflow/tensorflow

Fix for a deadlock in XlaDeviceContext

 class XlaDeviceContext : public DeviceContext {
   // Returns a device-to-device stream, in round-robin fashion.
   se::Stream* GetDeviceToDeviceStream();
 
+  virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,

You also don't need the virtual here.

ekuznetsov139

comment created time in 17 days

PullRequestReviewEvent
PullRequestReviewEvent

issue comment tensorflow/tensorflow

GPU devices not detected in tf-nightly in latest releases (Google Colaboratory)

2.4.0-dev20200819 is the first release with CUDA 11, so it is very likely that that's the root issue. Colab needs to be adjusted to use CUDA 11 instead of CUDA 10.1.

drajsel

comment created time in 19 days

issue comment tensorflow/tensorflow

Bigger Than Memory ops should automatically fallback to RAM and/or Disc in tf-gpu

Assigned to Gaurav since it looks like the core issue is https://github.com/tensorflow/tensorflow/issues/29840#issuecomment-507066169?

bionicles

comment created time in 20 days

issue comment tensorflow/tensorflow

Is SYCL on Tensorflow officially dead?

Hi @rodburns,

Sorry for the delayed response.

Generally we're moving away from integrating diverse HW backends into the mainline. Instead we want them to live out of tree, for obvious maintainability benefits -- the TensorFlow team can only own so much code. Moreover, this means you retain code ownership over the TF/SYCL integration (since it would no longer live in the TF repository), and don't need to go through Google's code review process to make changes.

This ability to have "out-of-tree backends" is a core design principle of TFRT (CC @mhong) and is also the motivation behind RFCs like https://github.com/tensorflow/community/pull/262 and https://github.com/tensorflow/community/pull/257 (CC @annarev).

So how about the following:

  • Let's drop SYCL support from the TF core repository. I'll do this in the next week or so unless you have strong objections.
  • Let's work together on making sure the pluggable device API and/or TFRT works for SYCL. IMO this will be a better use of eng resources on your side, as compared to upstreaming SYCL support to TF HEAD.
kevint324

comment created time in 22 days

issue comment tensorflow/tensorflow

Docker with GPU 2.3rc0 CUDA runtime implicit initialization on GPU:0 failed. Status: device kernel image is invalid

@mihaimaruseac Could @motrek's issue indicate a bug in how we build libtensorflow.so? I don't see the CUDA capabilities being configured here.

CC @angerson

jcrousse

comment created time in 25 days

issue comment tensorflow/tensorflow

CUDA runtime implicit initialization on GPU:0 failed. Status: device kernel image is invalid?

@unrealwill if you're running on Linux, you can use the TF Docker images to avoid having to deal with CUDA and cuDNN versions.

abhipn

comment created time in 25 days

PR opened tensorflow/docs

Clean up the table in docker.md

The rendering is broken today for the last row on the website. On GitHub preview the other rows are rendered incorrectly as well.

+7 -10

0 comment

1 changed file

pr created time in 25 days

push event sanjoy/docs

Sanjoy Das

commit sha 62df7eed343add713f48db1e242557139912c5f3

Clean up the table in docker.md The rendering is broken today for the last row.

view details

push time in 25 days

issue comment tensorflow/tensorflow

CUDA runtime implicit initialization on GPU:0 failed. Status: device kernel image is invalid?

Hi everyone,

tf-nightly, starting from the latest nightly build, should now work on GPUs with compute capability 5.0 such as the GeForce GTX 960M. Please give it a try and let us know how it goes.

abhipn

comment created time in 25 days

issue comment tensorflow/tensorflow

Docker with GPU 2.3rc0 CUDA runtime implicit initialization on GPU:0 failed. Status: device kernel image is invalid

@motrek Actually I don't really understand what is going on in your case (my previous message was mainly for folks running on GPUs with compute capability 5.0) since, as you've observed above, TF should work fine on compute capability 7.5.

Btw, how are you using the docker images? If you're using the ones listed at https://www.tensorflow.org/install/docker then you should only need a recent enough GPU driver on your host system.

jcrousse

comment created time in 25 days

issue comment tensorflow/tensorflow

Docker with GPU 2.3rc0 CUDA runtime implicit initialization on GPU:0 failed. Status: device kernel image is invalid

Hi everyone,

tf-nightly, starting from the latest nightly build as of now, should include SASS for compute capability 5.0, so it should work on GPUs like the GeForce GTX 960M. Please give it a try and let us know how it goes!

jcrousse

comment created time in 25 days

issue comment tensorflow/tensorflow

Docker with GPU 2.3rc0 CUDA runtime implicit initialization on GPU:0 failed. Status: device kernel image is invalid

@motrek TF 2.3 should indeed work on compute capability 7.5. However, this suggests that it lacks tensor cores; it is possible that's why you get the error.

jcrousse

comment created time in a month

pull request comment tensorflow/tensorflow

XLA Parallel reduce

CC @timshen91

trentlo

comment created time in a month

PullRequestReviewEvent

issue comment tensorflow/tensorflow

AOT compiled graph is 2-7x slower than Python

the compiled graph shouldn't change as invL is constant and can be optimized away. Unfortunately this does not happen with modelA1.

I didn't quite follow this -- isn't invL dependent on L which is a runtime variable value (i.e. not a constant)?

battuzz

comment created time in a month

issue comment tensorflow/tensorflow

from_dlpack unable to process arrays with column-major strides

Would a warning be a potential middle ground?

WDYT about adding an allow_noncanonical_layouts boolean argument to tf.experimental.dlpack.from_dlpack? If this arg is set then we do a transpose; otherwise we don't. That way it is obvious at the call site if a transpose may happen.

alecgunny

comment created time in a month

issue comment tensorflow/tensorflow

from_dlpack unable to process arrays with column-major strides

CC @VoVAllen

I'm slightly worried that this might break the (implicit) promise that transferring tensors via dlpack is "free", but I'm happy to be corrected.

alecgunny

comment created time in a month

PullRequestReviewEvent
PullRequestReviewEvent
PullRequestReviewEvent

issue comment tensorflow/tensorflow

from_dlpack unable to process arrays with column-major strides

I don't think this can be fixed easily -- tensorflow::Tensor does not have an independent layout that can be set. We can, however, do an in-memory transpose, but it won't be free.

alecgunny

comment created time in a month

issue comment tensorflow/tensorflow

Unresolved symbol EigenMatMulF64 when linking a compiled graph with XLA AOT runtime

@sanjoy Do we have any existing saved_models containing large variables/tensors in matmul?

Not sure if we have such models checked in. Can we just use the test case in this issue?

battuzz

comment created time in a month

PullRequestReviewEvent
PullRequestReviewEvent

Pull request review comment tensorflow/tensorflow

Fix DynamicPartitionOpGPU when running on multiple GPUs

 tf_kernel_library(
         ":gather_functor",
         ":gpu_prim_hdrs",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:gpu_runtime",

Is this necessary? check_numerics_op does not need this.

drebain

comment created time in a month

PullRequestReviewEvent
PullRequestReviewEvent

issue comment tensorflow/tensorflow

tf.dynamic_partition causes crash when using multiple GPUs via tf.distribute.MirroredStrategy

Hi @drebain,

Your fix looks correct to me; would you be willing to create a PR fixing the bug and adding the test? You can use collective_nccl_test as an example on how to add a multi-GPU test to TensorFlow.

You'll need to switch between ROCm and CUDA though, like we do here.
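
For reference, the guard pattern looks roughly like this (the test name and body here are placeholders, not the actual test):

#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/test.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

TEST(DynamicPartitionMultiGpuTest, Placeholder) {
#if GOOGLE_CUDA
  const char* platform = "CUDA";
#elif TENSORFLOW_USE_ROCM
  const char* platform = "ROCm";
#endif
  // The real multi-GPU test body would go here, mirroring collective_nccl_test.
  LOG(INFO) << "Running the multi-GPU test on " << platform;
}

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM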

drebain

comment created time in a month

pull request comment tensorflow/tensorflow

NVCC bug demonstration

Brian, does this still crash the GPU?

brianwa84

comment created time in a month

PullRequestReviewEvent

issue comment tensorflow/tensorflow

Cannot initialize variables on GPU with keras?

What is the type of the variable? ResourceApplyAdam has GPU kernels for floating point types.

@jaingaurav should we be logging the dtypes from the error message above?

ziofil

comment created time in a month

issue comment tensorflow/tensorflow

AOT compiled graph is 2-7x slower than Python

Is there the possibility that something else is going on?

Currently XLA CPU lowers TriangularSolve to a fairly naive loop of HLO instructions. It is slightly surprising but entirely possible that a naively written C++ implementation will be faster. Would you be willing to contribute a C++ implementation to XLA?
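
For concreteness, the kind of baseline I mean is roughly this (a minimal single-threaded forward substitution with no blocking or vectorization; illustrative, not XLA code):

#include <cassert>
#include <vector>

// Forward substitution solving L * x = b for a lower-triangular, row-major
// N x N matrix L. Roughly the starting point of a hand-written C++ version.
std::vector<double> LowerTriangularSolve(const std::vector<double>& L,
                                         const std::vector<double>& b,
                                         int n) {
  assert(static_cast<int>(L.size()) == n * n);
  std::vector<double> x(n);
  for (int i = 0; i < n; ++i) {
    double sum = b[i];
    for (int j = 0; j < i; ++j) {
      sum -= L[i * n + j] * x[j];
    }
    x[i] = sum / L[i * n + i];
  }
  return x;
}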

@hawkinsp does JAX have an optimized custom-call implementation for triangular-solve?

battuzz

comment created time in a month

issue comment tensorflow/tensorflow

Unresolved symbol EigenMatMulF64 when linking a compiled graph with XLA AOT runtime

@d0k any ideas why this ^ would happen?

battuzz

comment created time in a month

issue comment tensorflow/tensorflow

AOT compiled graph is 2-7x slower than Python

Hi @battuzz,

I believe this issue reports two separate problems:

  • TriangularSolve is much slower than expected. This has nothing to do with multi-threading; XLA CPU just does not have an optimized triangular solve implementation today.
  • Matrix multiply is much slower than expected. In this case I believe the Python version is using multiple threads while XLA isn't. XLA does support multi-threaded matrix multiplies, but that isn't exposed via the saved_model_cli interface. @ebrevdo I was not able to see how tfcompile depends on nsync.
battuzz

comment created time in a month

PullRequestReviewEvent

issue comment tensorflow/tensorflow

AOT compiled graph is 2-7x slower than Python

This isn't too surprising for triangular solve since XLA has a fairly naive implementation.

For GEMM this is quite surprising. Maybe the AOT compiled GEMM is using only one thread? Can you try passing --enable_multithreading to saved_model_cli aot_compile_cpu?
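
For context, a multi-threaded GEMM at the Eigen level (roughly the machinery that __xla_cpu_runtime_EigenMatMulF64 is built on) looks like the standalone sketch below; it assumes a recent Eigen and is not the saved_model_cli / tfcompile code path:

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

// Standalone sketch of a multi-threaded GEMM via Eigen tensor contraction.
Eigen::Tensor<float, 2> ThreadedMatMul(const Eigen::Tensor<float, 2>& a,
                                       const Eigen::Tensor<float, 2>& b,
                                       int num_threads) {
  Eigen::ThreadPool pool(num_threads);
  Eigen::ThreadPoolDevice device(&pool, num_threads);
  Eigen::Tensor<float, 2> c(a.dimension(0), b.dimension(1));
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {
      Eigen::IndexPair<int>(1, 0)};  // Contract a's columns with b's rows.
  c.device(device) = a.contract(b, dims);
  return c;
}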

CC @ebrevdo

battuzz

comment created time in a month

issue comment tensorflow/tensorflow

Unresolved symbol EigenMatMulF64 when linking a compiled graph with XLA AOT runtime

@ebrevdo Any idea what's going on? __xla_cpu_runtime_EigenMatMulF64 is defined in compiler/xla/service/cpu/runtime_matmul.cc, maybe saved_model_cli is not picking it up for some reason?

battuzz

comment created time in a month

issue comment tensorflow/tensorflow

no kernel image is available for execution on the device

TF 2.3 includes PTX kernels only for compute capability 7.0 to reduce the TF pip binary size. Earlier releases included PTX for a variety of older compute capabilities.

Yes. See here.

aligoglos

comment created time in a month

PullRequestReviewEvent
PullRequestReviewEvent

Pull request review comment tensorflow/tensorflow

[XLA] When LLVM doesn't know a CC that is more recent, warn only to developers.

 static string GetSmName(std::pair<int, int> compute_capability) {
   int sm_version = 30;
   // If the current compute capability isn't known, fallback to the
   // most recent version before it.
-  for (int v : {75, 72, 70, 62, 61, 60, 53, 52, 50, 37, 35, 32, 30}) {
+  auto supported_version = {75, 72, 70, 62, 61, 60, 53, 52, 50, 37, 35, 32, 30};

supported_versions

nouiz

comment created time in a month

PullRequestReviewEvent
PullRequestReviewEvent

Pull request review comment tensorflow/tensorflow

Fix tests when TensorFloat-32 is enabled

 def test_regularizer_loss(self, distribution):
       self.assertEqual(-1.0, v)
 
+@testing_utils.run_all_without_tensor_float_32(
+    'Uses Dense layers, which call matmul')

Can we adjust the tolerances instead? 3e-6 seems like a reasonable tolerance.

reedwm

comment created time in a month

Pull request review comment tensorflow/tensorflow

Fix tests when TensorFloat-32 is enabled

 from tensorflow.python.saved_model import saved_model
 
+@testing_utils.run_all_without_tensor_float_32(
+    'Uses Dense layers, which call matmul')

We should adjust the tolerance for this test as well instead of disabling it for TF32.

reedwm

comment created time in a month

Pull request review comment tensorflow/tensorflow

Fix tests when TensorFloat-32 is enabled

 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
+@test_util.run_all_without_tensor_float_32(
+    "It's unknown why this test requires TF32 to be disabled")
+# TODO(reedwm): Determine why this test requires TF32 disabled. Debugging is

Probably file a bug or GH issue (unless you intend to look at this immediately after this PR goes in)?

reedwm

comment created time in a month

issue comment tensorflow/tensorflow

CUDA runtime implicit initialization on GPU:0 failed. Status: device kernel image is invalid?

Sorry I missed this : [Default is: 3.5,7.0]. Yes in that case we would have to recompile.

That is the default for ./configure.py but the pip package we ship supports 3.5, 3.7, 5.2, 6.0, 6.1, 7.0 and higher than 7.0. I will update https://www.tensorflow.org/install/gpu to be clearer about this.

abhipn

comment created time in a month

issue comment tensorflow/tensorflow

CUDA runtime implicit initialization on GPU:0 failed. Status: device kernel image is invalid?

I have a Quadro M1200 which has compute capability 1200 and I am still getting this same error.

Quadro M1200 also has compute capability 5.0.

abhipn

comment created time in a month

issue comment tensorflow/tensorflow

Issue with CUDA and cuDNN

Yes I am definitely not seeing max gpu utilization. I tried the code nothing changed.

Can you try using the TensorBoard profiler to see if something sticks out?

meric-sakarya

comment created time in a month

push event sanjoy/tensorflow

Sanjoy Das

commit sha b2dac8b4e0ca545fc97f07c5418367a2cd8b1758

fix typo

view details

push time in a month

push event sanjoy/tensorflow

Sanjoy Das

commit sha c14dc2abd7d5455b9e2d8303f028deea7349f2e8

fix typo

view details

push time in a month

push event sanjoy/tensorflow

Sanjoy Das

commit sha 96c062cdb68ae66e89879c059a6a8a4c67a263d0

fix typo

view details

push time in a month

push event sanjoy/tensorflow

Sanjoy Das

commit sha 1af1e4b50bf091e3f6d918bc54e9f51fb5d472e9

fix typo

view details

push time in a month

push event sanjoy/tensorflow

Sanjoy Das

commit sha f68214030adb00e47ed0bfb54e5ba92a81f8f4d1

add headers to srcs

view details

push time in a month

push event sanjoy/tensorflow

Sanjoy Das

commit sha 4ae75c7de33db77c10a39c50fe05b226caea8aa1

Revert "wip" This reverts commit 5d831d790098fcfd9983381e13f62a63eb91c50c.

view details

Sanjoy Das

commit sha 586018ee3f1cbc6a73592274818af51c26200eb4

Revert "comment" This reverts commit 06cb7e6d61032a2a6865c88fc1bf8a365d110a3b.

view details

push time in a month

Pull request review comment tensorflow/tensorflow

Fix tests when TensorFloat-32 is enabled

 def test_regularizer_loss(self, distribution):
       self.assertEqual(-1.0, v)
 
+@testing_utils.run_all_without_tensor_float_32(
+    'Uses Dense layers, which call matmul')

This seems scary -- ideally we should be able to run dense layers in TF32 without noticing, right?

How much would we have to adjust the tolerances here to have this pass?

reedwm

comment created time in a month

Pull request review comment tensorflow/tensorflow

Fix tests when TensorFloat-32 is enabled

 def test_session(self):
         'test_session not supported on XLATestCase, please use session')
 
   @contextlib.contextmanager
-  def test_scope(self):
-    """Test scope that runs tests on `self.device`.
+  def device_scope(self):
+    """Scope that runs tests on `self.device`.
 
     Yields:
       A scope to apply to the operators under test.
     """
     with ops.device('device:{}:0'.format(self.device)):
       yield
 
+  def test_scope(self):
+    """Deprecated alias of `device_scope`.
+
+    This should be avoided as the name starts with `test`, so test runners
+    treat it as a test. This interferes with class decorators that operate on
+    each test method.
+    """
+    return self.device_scope()
+

Nice, this looks like something we can separate and land (or will that be too inconvenient)?

reedwm

comment created time in a month

Pull request review comment tensorflow/tensorflow

Fix tests when TensorFloat-32 is enabled

 namespace {
 using QrTest = xla::ClientLibraryTestBase;
 
 XLA_TEST_F(QrTest, Simple) {
+  tensorflow::allow_tf32_execution(false);

Let's leave some breadcrumbs here on what's going on; maybe wrap allow_tf32_execution into a helper function that has a comment explaining why it's needed.
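
Something like this is what I mean (the helper name and the comment wording are just illustrative):

namespace {

// Sketch of the suggested breadcrumb: QR results are checked against
// tolerances that TensorFloat-32's reduced mantissa cannot meet, so force
// full-precision float32 matmuls in this test.
void DisableTf32BecauseQrNeedsFullPrecision() {
  tensorflow::allow_tf32_execution(false);
}

}  // namespace

XLA_TEST_F(QrTest, Simple) {
  DisableTf32BecauseQrNeedsFullPrecision();
  // ... rest of the test unchanged ...
}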

reedwm

comment created time in a month

push event sanjoy/tensorflow

Sanjoy Das

commit sha 5d831d790098fcfd9983381e13f62a63eb91c50c

wip

view details

push time in a month

Pull request review comment tensorflow/tensorflow

Enabling a debug dll build under Windows (without CUDA, at least)

 class DepthToSpaceOp : public OpKernel {
     auto Tinput = input.tensor<T, kDims>();
     auto Toutput = outputs_tensor->tensor<T, kDims>();
 
-    if (std::is_same<Device, GPUDevice>::value) {
-      if (is_int8x4) {
-        // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
-        auto Tinput_v = input.template reinterpret_last_dimension<int32, 4>();
-        auto Toutput_v = outputs_tensor->reinterpret_last_dimension<int32, 4>();
-        functor::DepthToSpaceOpFunctor<GPUDevice, int32, FORMAT_NCHW> functor;
-        functor(context->eigen_device<GPUDevice>(), Tinput_v, block_size_,
-                Toutput_v);
-        return;
-      } else if (data_format_ == FORMAT_NCHW) {
-        functor::DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> functor;
-        functor(context->eigen_device<GPUDevice>(), Tinput, block_size_,
-                Toutput);
-        return;
-      }
-    }
-
     // NOTE: Assumes data_format_ == FORMAT_NHWC here, since we have rejected
     // (CPU && data_format_ != FORMAT_NHWC) in the constructor.

I mean something like:

template <class Device>
struct DepthToSpaceFunctorWrapper {
  void operator()(/* params */) {
    // .. generic implementation
  }
};

template <>
struct DepthToSpaceFunctorWrapper<GPUDevice> {
  void operator()(/* params */) {
    // .. GPU implementation
  }
};

void Compute(...) {
  // Common stuff ..
  auto Toutput = outputs_tensor->tensor<T, kDims>();
  DepthToSpaceFunctorWrapper<Device>{}(/* params as needed */);
}
MikhailStartsev

comment created time in a month

Pull request review comment tensorflow/tensorflow

Enabling a debug dll build under Windows (without CUDA, at least)

 class DepthToSpaceOp : public OpKernel {
     auto Tinput = input.tensor<T, kDims>();
     auto Toutput = outputs_tensor->tensor<T, kDims>();
 
-    if (std::is_same<Device, GPUDevice>::value) {
-      if (is_int8x4) {
-        // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
-        auto Tinput_v = input.template reinterpret_last_dimension<int32, 4>();
-        auto Toutput_v = outputs_tensor->reinterpret_last_dimension<int32, 4>();
-        functor::DepthToSpaceOpFunctor<GPUDevice, int32, FORMAT_NCHW> functor;
-        functor(context->eigen_device<GPUDevice>(), Tinput_v, block_size_,
-                Toutput_v);
-        return;
-      } else if (data_format_ == FORMAT_NCHW) {
-        functor::DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> functor;
-        functor(context->eigen_device<GPUDevice>(), Tinput, block_size_,
-                Toutput);
-        return;
-      }
-    }
-
     // NOTE: Assumes data_format_ == FORMAT_NHWC here, since we have rejected
     // (CPU && data_format_ != FORMAT_NHWC) in the constructor.

Can we specialize just this part that's different between GPU and CPU? Same comment for SpaceToDepthOp.

MikhailStartsev

comment created time in a month

Pull request review comment tensorflow/tensorflow

Enabling a debug dll build under Windows (without CUDA, at least)

 class DepthToSpaceOp : public OpKernel {
   TensorFormat data_format_;
 };
 
+// Template specialization for GPUDevice, explicit referncing GPUDevice in code

End comment with period, also typo in referncing

MikhailStartsev

comment created time in a month

issue comment tensorflow/tensorflow

tf.keras.Sequential() fails

Why would this work in Jupyter Notebooks or Colab but not in PyCharm?

I'm not familiar with PyCharm so unfortunately I cannot help with this.

alexn11

comment created time in a month

issue comment tensorflow/tensorflow

Reduction is average step time when disabling certain HLO passes

Tim, does this mean we need to adjust some of our heuristics in the passes listed?

CC @thomasjoerg @akuegel

mmadala95

comment created time in 2 months

issue comment tensorflow/tensorflow

tf.keras.Sequential() fails

This is the failing assertion which seems to be GPU independent. Do you know if you're doing something weird in your model?

from tensorflow.keras.models import Sequential
model = Sequential()

seems to work fine for me though, on tf-nightly.

alexn11

comment created time in 2 months

issue comment tensorflow/tensorflow

TF with CUDA 11 and cuDNN 8

the recent Nvidia update upended my system

@summa-code I recommend using docker if that works for your use case.

pkanwar23

comment created time in 2 months

Pull request review comment tensorflow/tensorflow

[TF2XLA] Add EuclideanNorm kernel

 void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(reduction_type_, &type));
 
-  auto data = xla::ConvertElementType(ctx->Input(0), type);
+  auto input = xla::ConvertElementType(ctx->Input(0), type);
+  auto converted_input = PreprocessInput(b, input);

Sorry for the miscommunication, I meant:

  auto converted_input = xla::ConvertElementType(ctx->Input(0), type);
  auto preprocessed_input = PreprocessInput(b, converted_input);
WindQAQ

comment created time in 2 months

Pull request review comment tensorflow/tensorflow

[TF2XLA] Add EuclideanNorm kernel

 void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
   xla::XlaComputation reduction_computation = r.Build().ConsumeValueOrDie();
 
   auto reduce = xla::Reduce(data, initial, reduction_computation, xla_axes);
-  auto finalized = BuildFinalizer(b, data, reduce, xla_axes);
+  auto finalized = BuildFinalizer(b, ctx->Input(0), reduce, xla_axes);

LGTM, except that I'd prefer s/input/converted_input/

WindQAQ

comment created time in 2 months

Pull request review comment tensorflow/tensorflow

[TF2XLA] Add EuclideanNorm kernel

 void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
   xla::XlaComputation reduction_computation = r.Build().ConsumeValueOrDie();
 
   auto reduce = xla::Reduce(data, initial, reduction_computation, xla_axes);
-  auto finalized = BuildFinalizer(b, data, reduce, xla_axes);
+  auto finalized = BuildFinalizer(b, ctx->Input(0), reduce, xla_axes);

Why did this need to change? Earlier we were passing in the result of ConvertElementType right?

WindQAQ

comment created time in 2 months

pull request comment tensorflow/tensorflow

Fix #41630: include max_seq_length in cudnn descriptor cache key

Can we hope for a 1.15.4 to include that fix ?

CC @goldiegadde

lissyx

comment created time in 2 months

issue comment tensorflow/tensorflow

CUDA runtime implicit initialization on GPU:0 failed. Status: device kernel image is invalid?

Hi @abhipn,

You can enter the compute capability when the configure script says:

Please note that each additional compute capability significantly increases your build time and binary size, and that TensorFlow only supports compute capabilities >= 3.5 [Default is: 3.5,7.0]: (Enter 5.0, and possibly others if needed.)

As for compile time, I don't think it will build quickly (although targeting just 5.0 should cut down on the compile time); I recommend building it on a beefy GCP VM for a quicker turnaround.

abhipn

comment created time in 2 months

issue comment tensorflow/tensorflow

GPU memory issue with triton server 20.03

I just only run or load with warm-up one model and check gpu memory usage by nvidia-smi.

Does Triton run the inference models on different streams? Or does it run them on a single stream? If the latter then this is not surprising -- the TF memory allocator must be reusing memory allocated and freed by the first model for the second model.

loveppdog

comment created time in 2 months

Pull request review comment tensorflow/tensorflow

[TF2XLA] Add EuclideanNorm kernel

 XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx,
       ctx, DataTypeToPrimitiveType(reduction_type_, &xla_reduction_type_));
 }
 
+// The default pre-processor directly returns the data. This can be overridden.
+xla::XlaOp XlaReductionOp::ProcessInput(xla::XlaBuilder* /*builder*/,

The verb here needs to be more specific than Process. Preprocess is fine.

WindQAQ

comment created time in 2 months

pull request comment tensorflow/tensorflow

Re-enable GPUNMSv3 and v4 for small box count users

CC @pkanwar23

samikama

comment created time in 2 months

PR closed tensorflow/tensorflow

Re-enable GPUNMSv3 and v4 for small box count users
Labels: cla: yes, size:S, stat:awaiting tensorflower

Reopening #34331 for master, to be CP'ed to 2.1. @mihaimaruseac @aaroey

This PR enables the GPU implementation for NonMaxSuppression ops v3 and v4, which were disabled due to high memory utilization at extremely large input counts (~8GB for 256,000 boxes). The op has modest memory requirements for most models, which have about O(10k) inputs.

+14 -14

12 comments

1 changed file

samikama

pr closed time in 2 months

pull request comment tensorflow/tensorflow

Re-enable GPUNMSv3 and v4 for small box count users

@sanjoy, Any update on this PR? Please. Thanks!

These kernels are now enabled at head and so this PR can be closed.

samikama

comment created time in 2 months

issue comment tensorflow/tensorflow

Problems with Transformations when trying to model.fit() my 2D CNN Model, leading to two exceptions

Sadly, I don't quite know how to export any logging

By "logging" I meant everything TensorFlow prints to stderr and stdout during execution. That output might have a hint on what's going wrong with cuDNN.

zhilothebest

comment created time in 2 months

issue comment tensorflow/tensorflow

CUDA_ERROR_ILLEGAL_ADDRESS in toy training example

@rmlarsen here cuSolverDN is failing with CUSOLVER_STATUS_EXECUTION_FAILED:

...
W tensorflow/core/framework/op_kernel.cc:1773] OP_REQUIRES failed at determinant_op.cc:228 : Internal: tensorflow/core/util/cuda_solvers.cc:466: cuSolverDN call failed with status =6
W tensorflow/core/framework/op_kernel.cc:1773] OP_REQUIRES failed at determinant_op.cc:228 : Internal: tensorflow/core/util/cuda_solvers.cc:466: cuSolverDN call failed with status =6
E tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:725] failed to record completion event; therefore, failed to create inter-stream dependency
I tensorflow/stream_executor/stream.cc:4952] [stream=0x402c0130,impl=0x402be810] did not memcpy host-to-device; source: 0x7f5089649fc0
E tensorflow/stream_executor/stream.cc:338] Error recording event in stream: Error recording CUDA event: CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory access was encountered; not marking stream as bad, as the Event object may be at fault. Monitor for further errors.
E tensorflow/stream_executor/cuda/cuda_event.cc:29] Error polling for event status: failed to query event: CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory access was encountered
F tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc:220] Unexpected Event status: 1

Any idea how to debug this further? Is it a user error? If so, probably TF should be returning a friendlier error message.

fachu000

comment created time in 2 months

Pull request review comment tensorflow/tensorflow

[TF2XLA] Add EuclideanNorm kernel

 class AnyOp : public XlaReductionOp {
 REGISTER_XLA_OP(Name("Any").CompileTimeConstantInput("reduction_indices"),
                 AnyOp);
 
+class EuclideanNormOp : public XlaReductionOp {
+ public:
+  explicit EuclideanNormOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx,
+                       XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
+  xla::XlaOp InitialValue(xla::XlaBuilder* builder) override {
+    return xla::Zero(builder, xla_reduction_type_);
+  }
+
+  xla::XlaOp ProcessData(xla::XlaBuilder* /*builder*/,
+                         const xla::XlaOp& data) override {
+    return xla::Mul(data, MaybeConjugate(data, true));
+  }
+
+  void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs,
+                    const xla::XlaOp& scalar_rhs) override {
+    xla::Add(scalar_lhs, scalar_rhs);
+  }
+
+  xla::XlaOp BuildFinalizer(
+      xla::XlaBuilder* /*builder*/, const xla::XlaOp& input,
+      const xla::XlaOp& reduce_output,
+      const std::vector<int64>& dimensions_to_reduce) override {
+    if (xla::primitive_util::IsIntegralType(xla_reduction_type_)) {

I don't think this is reducing in floating point; it is just doing the final sqrt in floating point.
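
i.e. roughly this shape (a sketch using the xla:: client builder ops; the helper name is made up and the actual branch in this PR may differ):

#include "tensorflow/compiler/xla/client/lib/math.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"

// Sketch of a finalizer that keeps the reduction itself integral and only
// does the final sqrt in floating point, converting back afterwards.
xla::XlaOp FinalizeIntegralEuclideanNorm(xla::XlaOp reduce_output,
                                         xla::PrimitiveType reduction_type) {
  xla::XlaOp as_float =
      xla::ConvertElementType(reduce_output, xla::F32);  // sqrt needs floats.
  xla::XlaOp root = xla::Sqrt(as_float);
  return xla::ConvertElementType(root, reduction_type);
}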

WindQAQ

comment created time in 2 months
