Tpetra parallel linear algebra  Version of the Day
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
43 #define TPETRA_MULTIVECTOR_DEF_HPP
44 
52 
53 #include "Tpetra_Util.hpp"
54 #include "Tpetra_Vector.hpp"
57 #include "Tpetra_Details_fill.hpp"
58 #include "Tpetra_Details_gathervPrint.hpp"
64 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
65 
66 
67 
68 #include "KokkosCompat_View.hpp"
69 #include "KokkosBlas.hpp"
70 #include "KokkosKernels_Utils.hpp"
71 #include "Kokkos_Random.hpp"
72 
73 #ifdef HAVE_TPETRA_INST_FLOAT128
74 namespace Kokkos {
// Specialization of Kokkos::rand for __float128, needed so that
// random-fill of a Tpetra::MultiVector works when Tpetra is
// instantiated on __float128.  Each draw() combines two double draws
// from the underlying generator: one as the high-order part and one
// scaled down (by half of the smallest normalized double) to supply
// extra low-order bits.  NOTE(review): whether this actually fills
// __float128's full mantissa with randomness is unverified — see the
// FIXMEs below.
75  // FIXME (mfh 04 Sep 2015) Just a stub for now!
76  template<class Generator>
77  struct rand<Generator, __float128> {
// Upper bound of the default draw() range.
78  static KOKKOS_INLINE_FUNCTION __float128 max ()
79  {
80  return static_cast<__float128> (1.0);
81  }
// Draw in [0, max()).
82  static KOKKOS_INLINE_FUNCTION __float128
83  draw (Generator& gen)
84  {
85  // Half the smallest normalized double, is the scaling factor of
86  // the lower-order term in the double-double representation.
87  const __float128 scalingFactor =
88  static_cast<__float128> (std::numeric_limits<double>::min ()) /
89  static_cast<__float128> (2.0);
90  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
91  const __float128 lowerOrderTerm =
92  static_cast<__float128> (gen.drand ()) * scalingFactor;
93  return higherOrderTerm + lowerOrderTerm;
94  }
// Draw in [0, range).
95  static KOKKOS_INLINE_FUNCTION __float128
96  draw (Generator& gen, const __float128& range)
97  {
98  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
99  const __float128 scalingFactor =
100  static_cast<__float128> (std::numeric_limits<double>::min ()) /
101  static_cast<__float128> (2.0);
102  const __float128 higherOrderTerm =
103  static_cast<__float128> (gen.drand (range));
104  const __float128 lowerOrderTerm =
105  static_cast<__float128> (gen.drand (range)) * scalingFactor;
106  return higherOrderTerm + lowerOrderTerm;
107  }
// Draw in [start, end).
108  static KOKKOS_INLINE_FUNCTION __float128
109  draw (Generator& gen, const __float128& start, const __float128& end)
110  {
111  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
112  const __float128 scalingFactor =
113  static_cast<__float128> (std::numeric_limits<double>::min ()) /
114  static_cast<__float128> (2.0);
115  const __float128 higherOrderTerm =
116  static_cast<__float128> (gen.drand (start, end));
117  const __float128 lowerOrderTerm =
118  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
119  return higherOrderTerm + lowerOrderTerm;
120  }
121  };
122 } // namespace Kokkos
123 #endif // HAVE_TPETRA_INST_FLOAT128
124 
125 namespace { // (anonymous)
126 
// Allocate an lclNumRows x numCols Kokkos::DualView suitable for a
// Tpetra::MultiVector<ST, LO, GO, NT>.  If zeroOut, the device
// allocation is zero-initialized; otherwise it is left uninitialized
// (and, in debug builds, filled with NaN — see below).  allowPadding
// requests a padded allocation from Kokkos.
// NOTE(review): this extraction dropped line 142, which held the
// return type; the function returns the MultiVector's
// dual_view_type (see the declaration of dv near the end).
141  template<class ST, class LO, class GO, class NT>
143  allocDualView (const size_t lclNumRows,
144  const size_t numCols,
145  const bool zeroOut = true,
146  const bool allowPadding = false)
147  {
148  using ::Tpetra::Details::Behavior;
149  using Kokkos::AllowPadding;
150  using Kokkos::view_alloc;
151  using Kokkos::WithoutInitializing;
152  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
153  typedef typename dual_view_type::t_dev dev_view_type;
154  // This needs to be a string and not a char*, if given as an
155  // argument to Kokkos::view_alloc. This is because view_alloc
156  // also allows a raw pointer as its first argument. See
157  // https://github.com/kokkos/kokkos/issues/434.
158  const std::string label ("MV::DualView");
159  const bool debug = Behavior::debug ();
160 
161  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
162  // creation of the DualView's Views works around
163  // Kokkos::DualView's current inability to accept an
164  // AllocationProperties initial argument (as Kokkos::View does).
165  // However, the work-around is harmless, since it does what the
166  // (currently nonexistent) equivalent DualView constructor would
167  // have done anyway.
168 
// Four allocation paths: {zero-init, uninitialized} x {padded, unpadded}.
169  dev_view_type d_view;
170  if (zeroOut) {
171  if (allowPadding) {
172  d_view = dev_view_type (view_alloc (label, AllowPadding),
173  lclNumRows, numCols);
174  }
175  else {
176  d_view = dev_view_type (view_alloc (label),
177  lclNumRows, numCols);
178  }
179  }
180  else {
181  if (allowPadding) {
182  d_view = dev_view_type (view_alloc (label,
183  WithoutInitializing,
184  AllowPadding),
185  lclNumRows, numCols);
186  }
187  else {
188  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
189  lclNumRows, numCols);
190  }
191  if (debug) {
192  // Filling with NaN is a cheap and effective way to tell if
193  // downstream code is trying to use a MultiVector's data
194  // without them having been initialized. ArithTraits lets us
195  // call nan() even if the scalar type doesn't define it; it
196  // just returns some undefined value in the latter case. This
197  // won't hurt anything because by setting zeroOut=false, users
198  // already agreed that they don't care about the contents of
199  // the MultiVector.
200  const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
201  KokkosBlas::fill (d_view, nan);
202  }
203  }
204  if (debug) {
205  TEUCHOS_TEST_FOR_EXCEPTION
206  (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
207  static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
208  "allocDualView: d_view's dimensions actual dimensions do not match "
209  "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
210  d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
211  << ". Please report this bug to the Tpetra developers.");
212  }
213 
// create_mirror_view may alias d_view when device memory is
// host-accessible; otherwise it allocates a separate host buffer.
214  typename dual_view_type::t_host h_view = Kokkos::create_mirror_view (d_view);
215 
216  dual_view_type dv (d_view, h_view);
217  // Whether or not the user cares about the initial contents of the
218  // MultiVector, the device and host views are out of sync. We
219  // prefer to work in device memory. The way to ensure this
220  // happens is to mark the device view as modified.
221  dv.template modify<typename dev_view_type::memory_space> ();
222 
223  return dv;
224  }
225 
226  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
227  //
228  // T: The type of the entries of the View.
229  // ExecSpace: The Kokkos execution space.
230  template<class T, class ExecSpace>
231  struct MakeUnmanagedView {
232  // The 'false' part of the branch carefully ensures that this
233  // won't attempt to use a host execution space that hasn't been
234  // initialized. For example, if Kokkos::OpenMP is disabled and
235  // Kokkos::Threads is enabled, the latter is always the default
236  // execution space of Kokkos::HostSpace, even when ExecSpace is
237  // Kokkos::Serial. That's why we go through the trouble of asking
238  // Kokkos::DualView what _its_ space is. That seems to work
239  // around this default execution space issue.
240  //
241  // NOTE (mfh 29 Jan 2016): See kokkos/kokkos#178 for why we use
242  // a memory space, rather than an execution space, as the first
243  // argument of VerifyExecutionCanAccessMemorySpace.
244  typedef typename Kokkos::Impl::if_c<
245  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
246  typename ExecSpace::memory_space,
247  Kokkos::HostSpace>::value,
248  typename ExecSpace::device_type,
249  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
250  typedef Kokkos::LayoutLeft array_layout;
251  typedef Kokkos::View<T*, array_layout, host_exec_space,
252  Kokkos::MemoryUnmanaged> view_type;
253 
// Wrap the ArrayView's memory in an unmanaged host View.  A
// zero-length input yields a default (empty) View, so we never
// construct a View from a possibly-null raw pointer.
254  static view_type getView (const Teuchos::ArrayView<T>& x_in)
255  {
256  const size_t numEnt = static_cast<size_t> (x_in.size ());
257  if (numEnt == 0) {
258  return view_type ();
259  } else {
260  return view_type (x_in.getRawPtr (), numEnt);
261  }
262  }
263  };
264 
265  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
266  // taking a subview of a 0 x N DualView incorrectly always results
267  // in a 0 x 0 DualView.
268  template<class DualViewType>
269  DualViewType
270  takeSubview (const DualViewType& X,
271  const Kokkos::Impl::ALL_t&,
272  const std::pair<size_t, size_t>& colRng)
273  {
274  if (X.extent (0) == 0 && X.extent (1) != 0) {
275  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
276  }
277  else {
278  return subview (X, Kokkos::ALL (), colRng);
279  }
280  }
281 
// Work-around (original note: mfh 14 Apr 2015) for a Kokkos::subview
// bug: taking a subview of a 0 x N DualView incorrectly produces a
// 0 x 0 DualView.  This overload selects the row range
// [rowRng.first, rowRng.second) and the column range
// [colRng.first, colRng.second).
template<class DualViewType>
DualViewType
takeSubview (const DualViewType& X,
             const std::pair<size_t, size_t>& rowRng,
             const std::pair<size_t, size_t>& colRng)
{
  const bool triggersKokkosBug =
    (X.extent (0) == 0) && (X.extent (1) != 0);
  if (! triggersKokkosBug) {
    return subview (X, rowRng, colRng);
  }
  // Allocate a fresh, correctly shaped 0 x (width of colRng) DualView
  // instead of relying on the buggy subview result.
  const size_t newNumCols = colRng.second - colRng.first;
  return DualViewType ("MV::DualView", 0, newNumCols);
}
298 } // namespace (anonymous)
299 
300 
301 namespace Tpetra {
302 namespace Classes {
303 
304  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
305  bool
306  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
307  vectorIndexOutOfRange (const size_t VectorIndex) const {
308  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
309  }
310 
// Default constructor: delegates to the base DistObject with a new,
// default-constructed Map; no view allocation happens here.
// NOTE(review): the extraction dropped lines 312-313, which held the
// "MultiVector<...>::MultiVector () :" declarator for this definition.
311  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
314  base_type (Teuchos::rcp (new map_type ()))
315  {}
316 
// Constructor: allocate a (local rows) x numVecs DualView over the
// given Map.  zeroOut selects zero-initialized vs. uninitialized
// storage (see allocDualView above).  origView_ tracks the full
// original allocation; initially it is the same as view_.
317  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
319  MultiVector (const Teuchos::RCP<const map_type>& map,
320  const size_t numVecs,
321  const bool zeroOut) : /* default is true */
322  base_type (map)
323  {
324  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
325 
326  const size_t lclNumRows = this->getLocalLength ();
327  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
328  origView_ = view_;
329  }
330 
// Copy constructor (shallow): copies the DualView handles and the
// whichVectors_ list from source.  Copying a Kokkos (Dual)View shares
// the underlying allocation, so this aliases source's data rather
// than deep-copying it.
331  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
334  base_type (source),
335  view_ (source.view_),
336  origView_ (source.origView_),
337  whichVectors_ (source.whichVectors_)
338  {}
339 
// Copy constructor with Teuchos::DataAccess control.  Teuchos::Copy
// replaces the (already shallow-copied) views with a deep copy from
// createCopy; Teuchos::View keeps the shallow copy made by the member
// initializers, so its branch is deliberately empty; any other value
// throws std::invalid_argument.
340  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
343  const Teuchos::DataAccess copyOrView) :
344  base_type (source),
345  view_ (source.view_),
346  origView_ (source.origView_),
347  whichVectors_ (source.whichVectors_)
348  {
350  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
351  "const Teuchos::DataAccess): ";
352 
353  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
354 
355  if (copyOrView == Teuchos::Copy) {
356  // Reuse the conveniently already existing function that creates
357  // a deep copy.
358  MV cpy = createCopy (source);
359  this->view_ = cpy.view_;
360  this->origView_ = cpy.origView_;
361  this->whichVectors_ = cpy.whichVectors_;
362  }
// Teuchos::View: nothing to do; the member initializers already
// aliased source's data.
363  else if (copyOrView == Teuchos::View) {
364  }
365  else {
366  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
367  true, std::invalid_argument, "Second argument 'copyOrView' has an "
368  "invalid value " << copyOrView << ". Valid values include "
369  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
370  << Teuchos::View << ".");
371  }
372  }
373 
// View constructor: wrap an existing DualView without copying.  Only
// sanity-checks that the view's column stride (LDA) is at least the
// local row count implied by the Map.
// NOTE(review): the error message below contains the typo
// "does not not match".
374  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
376  MultiVector (const Teuchos::RCP<const map_type>& map,
377  const dual_view_type& view) :
378  base_type (map),
379  view_ (view),
380  origView_ (view)
381  {
382  const char tfecfFuncName[] = "MultiVector(map,view): ";
383 
384  // Get stride of view: if second dimension is 0, the
385  // stride might be 0, so take view_dimension instead.
386  size_t stride[8];
387  origView_.stride (stride);
388  const size_t LDA = (origView_.extent (1) > 1) ? stride[1] :
389  origView_.extent (0);
390  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
391  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
392  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
393  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
394  << ". This may also mean that the input view's first dimension (number "
395  "of rows = " << view.extent (0) << ") does not not match the number "
396  "of entries in the Map on the calling process.");
397  }
398 
399 
// Device-view constructor: wrap an existing device View.  A matching
// host mirror is allocated (create_mirror, not create_mirror_view, so
// no data is copied), and the device side is marked modified so the
// next sync copies device -> host.
400  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
402  MultiVector (const Teuchos::RCP<const map_type>& map,
403  const typename dual_view_type::t_dev& d_view) :
404  base_type (map)
405  {
406  using Teuchos::ArrayRCP;
407  using Teuchos::RCP;
408  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
409 
410  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
411 
412  // Get stride of view: if second dimension is 0, the stride might
413  // be 0, so take view_dimension instead.
414  size_t stride[8];
415  d_view.stride (stride);
416  const size_t LDA = (d_view.extent (1) > 1) ? stride[1] :
417  d_view.extent (0);
418  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
419  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
420  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::View's "
421  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
422  << ". This may also mean that the input view's first dimension (number "
423  "of rows = " << d_view.extent (0) << ") does not not match the "
424  "number of entries in the Map on the calling process.");
425 
426  // The difference between create_mirror and create_mirror_view, is
427  // that the latter copies to host. We don't necessarily want to
428  // do that; we just want to allocate the memory.
429  view_ = dual_view_type (d_view, Kokkos::create_mirror (d_view));
430  // The user gave us a device view. We take it as canonical, which
431  // means we mark it as "modified," so that the next sync will
432  // synchronize it with the host view.
433  this->template modify<device_type> ();
434  origView_ = view_;
435  }
436 
// View constructor taking both the view to use and the original
// allocation (origView) it was taken from; keeping origView lets
// offset views recover the full original extent.  Only validates the
// column stride of origView against the Map's local row count.
437  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
439  MultiVector (const Teuchos::RCP<const map_type>& map,
440  const dual_view_type& view,
441  const dual_view_type& origView) :
442  base_type (map),
443  view_ (view),
444  origView_ (origView)
445  {
446  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
447 
448  // Get stride of view: if second dimension is 0, the
449  // stride might be 0, so take view_dimension instead.
450  size_t stride[8];
451  origView_.stride (stride);
452  const size_t LDA = (origView_.extent (1) > 1) ? stride[1] :
453  origView_.extent (0);
454  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
455  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
456  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
457  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
458  << ". This may also mean that the input origView's first dimension (number "
459  "of rows = " << origView.extent (0) << ") does not not match the number "
460  "of entries in the Map on the calling process.");
461  }
462 
463 
// Constructor viewing a possibly noncontiguous set of columns
// (whichVectors) of an existing DualView.  An empty whichVectors
// means "view all columns, constant stride."  A single-entry
// whichVectors degenerates to a constant-stride single-column view.
464  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
466  MultiVector (const Teuchos::RCP<const map_type>& map,
467  const dual_view_type& view,
468  const Teuchos::ArrayView<const size_t>& whichVectors) :
469  base_type (map),
470  view_ (view),
471  origView_ (view),
472  whichVectors_ (whichVectors.begin (), whichVectors.end ())
473  {
474  using Kokkos::ALL;
475  using Kokkos::subview;
476  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
477 
478  const size_t lclNumRows = map.is_null () ? size_t (0) :
479  map->getNodeNumElements ();
480  // Check dimensions of the input DualView. We accept that Kokkos
481  // might not allow construction of a 0 x m (Dual)View with m > 0,
482  // so we only require the number of rows to match if the
483  // (Dual)View has more than zero columns. Likewise, we only
484  // require the number of columns to match if the (Dual)View has
485  // more than zero rows.
486  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
487  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
488  std::invalid_argument, "view.extent(0) = " << view.extent (0)
489  << " < map->getNodeNumElements() = " << lclNumRows << ".")_;
490  if (whichVectors.size () != 0) {
// NOTE(review): the condition on the next check,
// "view.extent (1) != 0 && view.extent (1) == 0", is always false,
// so this exception can never fire; it appears it was meant to
// reject a zero-column view when whichVectors is nonempty.
491  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
492  view.extent (1) != 0 && view.extent (1) == 0,
493  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
494  " = " << whichVectors.size () << " > 0.");
495  size_t maxColInd = 0;
496  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
497  for (size_type k = 0; k < whichVectors.size (); ++k) {
498  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
499  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
500  std::invalid_argument, "whichVectors[" << k << "] = "
501  "Teuchos::OrdinalTraits<size_t>::invalid().");
502  maxColInd = std::max (maxColInd, whichVectors[k]);
503  }
504  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
505  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
506  std::invalid_argument, "view.extent(1) = " << view.extent (1)
507  << " <= max(whichVectors) = " << maxColInd << ".");
508  }
509 
510  // Get stride of view: if second dimension is 0, the
511  // stride might be 0, so take view_dimension instead.
512  size_t stride[8];
513  origView_.stride (stride);
514  const size_t LDA = (origView_.extent (1) > 1) ? stride[1] :
515  origView_.extent (0);
516  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
517  LDA < lclNumRows, std::invalid_argument,
518  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
519 
520  if (whichVectors.size () == 1) {
521  // If whichVectors has only one entry, we don't need to bother
522  // with nonconstant stride. Just shift the view over so it
523  // points to the desired column.
524  //
525  // NOTE (mfh 10 May 2014) This is a special case where we set
526  // origView_ just to view that one column, not all of the
527  // original columns. This ensures that the use of origView_ in
528  // offsetView works correctly.
529  const std::pair<size_t, size_t> colRng (whichVectors[0],
530  whichVectors[0] + 1);
531  view_ = takeSubview (view_, ALL (), colRng);
532  origView_ = takeSubview (origView_, ALL (), colRng);
533  // whichVectors_.size() == 0 means "constant stride."
534  whichVectors_.clear ();
535  }
536  }
537 
// Constructor viewing a possibly noncontiguous set of columns
// (whichVectors) of an existing DualView, while also recording the
// original allocation (origView).  Same validation and single-column
// special case as the three-argument whichVectors constructor.
538  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
540  MultiVector (const Teuchos::RCP<const map_type>& map,
541  const dual_view_type& view,
542  const dual_view_type& origView,
543  const Teuchos::ArrayView<const size_t>& whichVectors) :
544  base_type (map),
545  view_ (view),
546  origView_ (origView),
547  whichVectors_ (whichVectors.begin (), whichVectors.end ())
548  {
549  using Kokkos::ALL;
550  using Kokkos::subview;
551  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
552 
553  const size_t lclNumRows = this->getLocalLength ();
554  // Check dimensions of the input DualView. We accept that Kokkos
555  // might not allow construction of a 0 x m (Dual)View with m > 0,
556  // so we only require the number of rows to match if the
557  // (Dual)View has more than zero columns. Likewise, we only
558  // require the number of columns to match if the (Dual)View has
559  // more than zero rows.
560  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
561  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
562  std::invalid_argument, "view.extent(0) = " << view.extent (0)
563  << " < map->getNodeNumElements() = " << lclNumRows << ".");
564  if (whichVectors.size () != 0) {
// NOTE(review): the condition on the next check,
// "view.extent (1) != 0 && view.extent (1) == 0", is always false,
// so this exception can never fire; it appears it was meant to
// reject a zero-column view when whichVectors is nonempty.
565  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
566  view.extent (1) != 0 && view.extent (1) == 0,
567  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
568  " = " << whichVectors.size () << " > 0.");
569  size_t maxColInd = 0;
570  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
571  for (size_type k = 0; k < whichVectors.size (); ++k) {
572  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
573  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
574  std::invalid_argument, "whichVectors[" << k << "] = "
575  "Teuchos::OrdinalTraits<size_t>::invalid().");
576  maxColInd = std::max (maxColInd, whichVectors[k]);
577  }
578  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
579  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
580  std::invalid_argument, "view.extent(1) = " << view.extent (1)
581  << " <= max(whichVectors) = " << maxColInd << ".");
582  }
583  // Get stride of view: if second dimension is 0, the
584  // stride might be 0, so take view_dimension instead.
585  size_t stride[8];
586  origView_.stride (stride);
587  const size_t LDA = (origView_.extent (1) > 1) ? stride[1] :
588  origView_.extent (0);
589  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
590  LDA < lclNumRows, std::invalid_argument, "Input DualView's column stride"
591  " = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
592 
593  if (whichVectors.size () == 1) {
594  // If whichVectors has only one entry, we don't need to bother
595  // with nonconstant stride. Just shift the view over so it
596  // points to the desired column.
597  //
598  // NOTE (mfh 10 May 2014) This is a special case where we set
599  // origView_ just to view that one column, not all of the
600  // original columns. This ensures that the use of origView_ in
601  // offsetView works correctly.
602  const std::pair<size_t, size_t> colRng (whichVectors[0],
603  whichVectors[0] + 1);
604  view_ = takeSubview (view_, ALL (), colRng);
605  origView_ = takeSubview (origView_, ALL (), colRng);
606  // whichVectors_.size() == 0 means "constant stride."
607  whichVectors_.clear ();
608  }
609  }
610 
// Deep-copy constructor from host data in a Teuchos::ArrayView with
// column stride LDA.  Allocates fresh device storage and copies the
// input in: one bulk deep_copy when the strides agree, otherwise one
// deep_copy per column.
611  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
613  MultiVector (const Teuchos::RCP<const map_type>& map,
614  const Teuchos::ArrayView<const Scalar>& data,
615  const size_t LDA,
616  const size_t numVecs) :
617  base_type (map)
618  {
619  typedef LocalOrdinal LO;
620  typedef GlobalOrdinal GO;
621  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
622  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
623 
624  // Deep copy constructor, constant stride (NO whichVectors_).
625  // There is no need for a deep copy constructor with nonconstant stride.
626 
627  const size_t lclNumRows =
628  map.is_null () ? size_t (0) : map->getNodeNumElements ();
629  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
630  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
631  "map->getNodeNumElements() = " << lclNumRows << ".");
632  if (numVecs != 0) {
// Last column needs only lclNumRows entries, not a full LDA stride.
633  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
634  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
635  (static_cast<size_t> (data.size ()) < minNumEntries,
636  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
637  "entries, given the input Map and number of vectors in the MultiVector."
638  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
639  "map->getNodeNumElements () = " << minNumEntries << ".");
640  }
641 
642  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
643  this->template modify<device_type> ();
644  auto X_out = this->template getLocalView<device_type> ();
645  origView_ = view_;
646 
647  // Make an unmanaged host Kokkos::View of the input data. First
648  // create a View (X_in_orig) with the original stride. Then,
649  // create a subview (X_in) with the right number of columns.
650  const impl_scalar_type* const X_in_raw =
651  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
652  Kokkos::View<const impl_scalar_type**,
653  Kokkos::LayoutLeft,
654  Kokkos::HostSpace,
655  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
656  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
657  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
658 
659  // If LDA != X_out's column stride, then we need to copy one
660  // column at a time; Kokkos::deep_copy refuses to work in that
661  // case.
662  size_t outStrides[8];
663  X_out.stride (outStrides);
664  const size_t outStride = (X_out.extent (1) > 1) ? outStrides[1] :
665  X_out.extent (0);
666  if (LDA == outStride) { // strides are the same; deep_copy once
667  // This only works because MultiVector uses LayoutLeft.
668  // We would need a custom copy functor otherwise.
669  Kokkos::deep_copy (X_out, X_in);
670  }
671  else { // strides differ; copy one column at a time
672  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
673  out_col_view_type;
674  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
675  in_col_view_type;
676  for (size_t j = 0; j < numVecs; ++j) {
677  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
678  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
679  Kokkos::deep_copy (X_out_j, X_in_j);
680  }
681  }
682  }
683 
// Deep-copy constructor from an array of per-column ArrayViews:
// allocates fresh device storage and copies each input column into
// the corresponding output column, one deep_copy per column.
684  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
686  MultiVector (const Teuchos::RCP<const map_type>& map,
687  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
688  const size_t numVecs) :
689  base_type (map)
690  {
691  typedef impl_scalar_type IST;
692  typedef LocalOrdinal LO;
693  typedef GlobalOrdinal GO;
694  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
695  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
696 
697  const size_t lclNumRows =
698  map.is_null () ? size_t (0) : map->getNodeNumElements ();
699  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
700  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
701  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
702  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
// Every input column must supply at least lclNumRows entries.
703  for (size_t j = 0; j < numVecs; ++j) {
704  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
705  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
706  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
707  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
708  << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
709  << ".");
710  }
711 
712  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
713  this->template modify<device_type> ();
714  auto X_out = this->getLocalView<device_type> ();
715 
716  // Make sure that the type of a single input column has the same
717  // array layout as each output column does, so we can deep_copy.
718  typedef typename decltype (X_out)::array_layout array_layout;
719  typedef typename Kokkos::View<const IST*,
720  array_layout,
721  Kokkos::HostSpace,
722  Kokkos::MemoryUnmanaged> input_col_view_type;
723 
724  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
725  for (size_t j = 0; j < numVecs; ++j) {
726  Teuchos::ArrayView<const IST> X_j_av =
727  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
728  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
729  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
730  Kokkos::deep_copy (X_j_out, X_j_in);
731  }
732  origView_ = view_;
733  }
734 
735  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
738 
// Constant stride iff whichVectors_ is empty, i.e., this MultiVector
// views contiguous columns of its original allocation.
739  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
742  return whichVectors_.empty ();
743  }
744 
// Number of rows owned by the calling process, per this object's Map;
// zero if the Map is null (possible after replaceMap()).
745  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
746  size_t
749  {
750  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
751  return static_cast<size_t> (0);
752  } else {
753  return this->getMap ()->getNodeNumElements ();
754  }
755  }
756 
// Global number of rows across all processes, per this object's Map;
// zero if the Map is null (possible after replaceMap()).
757  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
761  {
762  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
763  return static_cast<size_t> (0);
764  } else {
765  return this->getMap ()->getGlobalNumElements ();
766  }
767  }
768 
// Column stride (LDA) of the original allocation, or 0 if this
// MultiVector does not have constant stride (nonempty whichVectors_).
769  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
770  size_t
772  getStride () const
773  {
774  if (isConstantStride ()) {
775  // Get stride of view: if second dimension is 0, the
776  // stride might be 0, so take view_dimension instead.
777  size_t stride[8];
778  origView_.stride (stride);
779  const size_t LDA = (origView_.extent (1) > 1) ? stride[1] :
780  origView_.extent (0);
781  return LDA;
782  }
783  else {
784  return static_cast<size_t> (0);
785  }
786  }
787 
// Import/Export compatibility check: the source must itself be a
// MultiVector of this type (checked by dynamic_cast) with the same
// number of columns; the Maps were already checked when the
// Import/Export object was built.
788  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
789  bool
791  checkSizes (const SrcDistObject& sourceObj)
792  {
793  // Check whether the source object is a MultiVector. If not, then
794  // we can't even compare sizes, so it's definitely not OK to
795  // Import or Export from it.
797  const this_type* src = dynamic_cast<const this_type*> (&sourceObj);
798  if (src == NULL) {
799  return false;
800  } else {
801  // The target of the Import or Export calls checkSizes() in
802  // DistObject::doTransfer(). By that point, we've already
803  // constructed an Import or Export object using the two
804  // multivectors' Maps, which means that (hopefully) we've
805  // already checked other attributes of the multivectors. Thus,
806  // all we need to do here is check the number of columns.
807  return src->getNumVectors () == this->getNumVectors ();
808  }
809  }
810 
// Each row communicates one value per column during Import/Export,
// so the constant packet count per LID is the number of vectors.
811  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
812  size_t
815  return this->getNumVectors ();
816  }
817 
// copyAndPermuteNew: the "copy and permute" phase of an Import or
// Export.  First copies rows [0, numSameIDs-1] of sourceObj (which
// must be a MultiVector; see checkSizes) directly into *this, then
// copies source rows permuteFromLIDs[k] into target rows
// permuteToLIDs[k].  The copy/permute runs on host or device,
// depending on where the source's data were most recently modified
// (this method may not sync the const source).
818  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
819  void
821  copyAndPermuteNew (const SrcDistObject& sourceObj,
822  const size_t numSameIDs,
823  const Kokkos::DualView<const LocalOrdinal*, device_type>& permuteToLIDs,
824  const Kokkos::DualView<const LocalOrdinal*, device_type>& permuteFromLIDs)
825  {
826  using ::Tpetra::Details::Behavior;
829  using ::Tpetra::Details::ProfilingRegion;
830  using KokkosRefactor::Details::permute_array_multi_column;
831  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
832  using Kokkos::Compat::create_const_view;
833  typedef typename device_type::memory_space DMS;
834  typedef Kokkos::HostSpace HMS;
// tfecfFuncName is the prefix TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
// uses in its exception messages.
836  const char tfecfFuncName[] = "copyAndPermuteNew: ";
837  ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
838 
839  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
840  // environment variable to "1" (or "TRUE") for copious debug
841  // output to std::cerr on every MPI process. This is unwise for
842  // runs with large numbers of MPI processes.
843  const bool debug = Behavior::verbose ();
844  int myRank = 0;
845  if (debug && ! this->getMap ().is_null () &&
846  ! this->getMap ()->getComm ().is_null ()) {
847  myRank = this->getMap ()->getComm ()->getRank ();
848  }
849 
// The two permutation LID lists must pair up one-to-one.
850  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
851  (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
852  std::runtime_error, "permuteToLIDs.extent(0) = "
853  << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
854  << permuteFromLIDs.extent (0) << ".");
855 
856  // We've already called checkSizes(), so this cast must succeed.
857  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
858  const size_t numCols = this->getNumVectors ();
859 
860  // The input sourceObj controls whether we copy on host or device.
861  // This is because this method is not allowed to modify sourceObj,
862  // so it may not sync sourceObj; it must use the most recently
863  // updated version (host or device) of sourceObj's data.
864  //
865  // If we need sync to device, then host has the most recent version.
866  const bool copyOnHost = sourceMV.template need_sync<DMS> ();
867  if (debug) {
868  std::ostringstream os;
869  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: "
870  "copyOnHost=" << (copyOnHost ? "true" : "false") << std::endl;
871  std::cerr << os.str ();
872  }
873 
// Bring *this* (the target) up to date in the chosen memory space,
// then mark it modified there, so later syncs see our writes.
874  if (copyOnHost) {
875  if(this->template need_sync<HMS> ()) this->template sync<HMS> ();
876  this->template modify<HMS> ();
877  }
878  else {
879  if(this->template need_sync<DMS> ()) this->template sync<DMS> ();
880  this->template modify<DMS> ();
881  }
882 
883  if (debug) {
884  std::ostringstream os;
885  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: "
886  "Copy source to target" << std::endl;
887  std::cerr << os.str ();
888  }
889 
890  // TODO (mfh 15 Sep 2013) When we replace
891  // KokkosClassic::MultiVector with a Kokkos::View, there are two
892  // ways to copy the data:
893  //
894  // 1. Get a (sub)view of each column and call deep_copy on that.
895  // 2. Write a custom kernel to copy the data.
896  //
897  // The first is easier, but the second might be more performant in
898  // case we decide to use layouts other than LayoutLeft. It might
899  // even make sense to hide whichVectors_ in an entirely new layout
900  // for Kokkos Views.
901 
902  // Copy rows [0, numSameIDs-1] of the local multivectors.
903  //
904  // Note (ETP 2 Jul 2014) We need to always copy one column at a
905  // time, even when both multivectors are constant-stride, since
906  // deep_copy between strided subviews with more than one column
907  // doesn't currently work.
908 
909  if (numSameIDs > 0) {
910  const std::pair<size_t, size_t> rows (0, numSameIDs);
911  if (copyOnHost) {
912  auto tgt_h = this->template getLocalView<HMS> ();
913  auto src_h = create_const_view (sourceMV.template getLocalView<HMS> ());
914 
915  for (size_t j = 0; j < numCols; ++j) {
// whichVectors_ maps logical column j to its physical column when
// the MultiVector is a noncontiguous-column subview.
916  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
917  const size_t srcCol =
918  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
919 
920  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
921  auto src_j = Kokkos::subview (src_h, rows, srcCol);
922  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
923  }
924  }
925  else { // copy on device
926  auto tgt_d = this->template getLocalView<DMS> ();
927  auto src_d = create_const_view (sourceMV.template getLocalView<DMS> ());
928 
929  for (size_t j = 0; j < numCols; ++j) {
930  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
931  const size_t srcCol =
932  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
933 
934  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
935  auto src_j = Kokkos::subview (src_d, rows, srcCol);
936  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
937  }
938  }
939  }
940 
941  // For the remaining GIDs, execute the permutations. This may
942  // involve noncontiguous access of both source and destination
943  // vectors, depending on the LID lists.
944  //
945  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
946  // the same process, this merges their values by replacement of
947  // the last encountered GID, not by the specified merge rule
948  // (such as ADD).
949 
950  // If there are no permutations, we are done
951  if (permuteFromLIDs.extent (0) == 0 ||
952  permuteToLIDs.extent (0) == 0) {
953  if (debug) {
954  std::ostringstream os;
955  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: "
956  "No permutations; done." << std::endl;
957  std::cerr << os.str ();
958  }
959  return;
960  }
961 
962  if (debug) {
963  std::ostringstream os;
964  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: "
965  "Permute source to target" << std::endl;
966  std::cerr << os.str ();
967  }
968 
969  // mfh 29 Aug 2017: Kokkos::DualView annoyingly forbids (at run
970  // time, no less!) sync'ing a DualView<const T, D>. We work
971  // around this by managing the copying and flags ourselves --
972  // essentially reimplementing sync and modify.
973  Kokkos::DualView<LocalOrdinal*, device_type> permuteToLIDs_nc =
974  castAwayConstDualView (permuteToLIDs);
975  Kokkos::DualView<LocalOrdinal*, device_type> permuteFromLIDs_nc =
976  castAwayConstDualView (permuteFromLIDs);
977 
978  // We could in theory optimize for the case where exactly one of
979  // them is constant stride, but we don't currently do that.
980  const bool nonConstStride =
981  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
982 
983  if (debug) {
984  std::ostringstream os;
985  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: "
986  "nonConstStride=" << (nonConstStride ? "true" : "false") << std::endl;
987  std::cerr << os.str ();
988  }
989 
990  // We only need the "which vectors" arrays if either the source or
991  // target MV is not constant stride. Since we only have one
992  // kernel that must do double-duty, we have to create a "which
993  // vectors" array for the MV that _is_ constant stride.
994  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
995  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
996  if (nonConstStride) {
997  if (this->whichVectors_.size () == 0) {
// Target is constant stride: build the identity column map [0, numCols).
998  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
999  tmpTgt.template modify<HMS> ();
1000  for (size_t j = 0; j < numCols; ++j) {
1001  tmpTgt.h_view(j) = j;
1002  }
1003  if (! copyOnHost) {
1004  tmpTgt.template sync<DMS> ();
1005  }
1006  tgtWhichVecs = tmpTgt;
1007  }
1008  else {
1009  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1010  tgtWhichVecs =
1011  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1012  "tgtWhichVecs",
1013  copyOnHost);
1014  }
1015  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1016  (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1017  this->getNumVectors (),
1018  std::logic_error, "tgtWhichVecs.extent(0) = " <<
1019  tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1020  this->getNumVectors () << ".");
1021 
1022  if (sourceMV.whichVectors_.size () == 0) {
// Source is constant stride: identity column map, as for the target.
1023  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1024  tmpSrc.template modify<HMS> ();
1025  for (size_t j = 0; j < numCols; ++j) {
1026  tmpSrc.h_view(j) = j;
1027  }
1028  if (! copyOnHost) {
1029  tmpSrc.template sync<DMS> ();
1030  }
1031  srcWhichVecs = tmpSrc;
1032  }
1033  else {
1034  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1035  sourceMV.whichVectors_ ();
1036  srcWhichVecs =
1037  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1038  "srcWhichVecs",
1039  copyOnHost);
1040  }
1041  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1042  (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1043  sourceMV.getNumVectors (), std::logic_error,
1044  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1045  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1046  << ".");
1047  }
1048 
1049  if (copyOnHost) { // permute on host too
1050  if (debug) {
1051  std::ostringstream os;
1052  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: "
1053  "Set up permutation arrays on host" << std::endl;
1054  std::cerr << os.str ();
1055  }
1056  auto tgt_h = this->template getLocalView<HMS> ();
1057  auto src_h = create_const_view (sourceMV.template getLocalView<HMS> ());
1058  permuteToLIDs_nc.template sync<HMS> ();
1059  auto permuteToLIDs_h =
1060  create_const_view (permuteToLIDs_nc.template view<HMS> ());
1061  permuteFromLIDs_nc.template sync<HMS> ();
1062  auto permuteFromLIDs_h =
1063  create_const_view (permuteFromLIDs_nc.template view<HMS> ());
1064 
1065  if (debug) {
1066  std::ostringstream os;
1067  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: "
1068  "Call permute kernel on host" << std::endl;
1069  std::cerr << os.str ();
1070  }
1071  if (nonConstStride) {
1072  // No need to sync first, because copyOnHost argument to
1073  // getDualViewCopyFromArrayView puts them in the right place.
1074  auto tgtWhichVecs_h =
1075  create_const_view (tgtWhichVecs.template view<HMS> ());
1076  auto srcWhichVecs_h =
1077  create_const_view (srcWhichVecs.template view<HMS> ());
1078  permute_array_multi_column_variable_stride (tgt_h, src_h,
1079  permuteToLIDs_h,
1080  permuteFromLIDs_h,
1081  tgtWhichVecs_h,
1082  srcWhichVecs_h, numCols);
1083  }
1084  else {
1085  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1086  permuteFromLIDs_h, numCols);
1087  }
1088  }
1089  else { // permute on device
1090  if (debug) {
1091  std::ostringstream os;
1092  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: "
1093  "Set up permutation arrays on device" << std::endl;
1094  std::cerr << os.str ();
1095  }
1096  auto tgt_d = this->template getLocalView<DMS> ();
1097  auto src_d = create_const_view (sourceMV.template getLocalView<DMS> ());
1098  permuteToLIDs_nc.template sync<DMS> ();
1099  auto permuteToLIDs_d =
1100  create_const_view (permuteToLIDs_nc.template view<DMS> ());
1101  permuteFromLIDs_nc.template sync<DMS> ();
1102  auto permuteFromLIDs_d =
1103  create_const_view (permuteFromLIDs_nc.template view<DMS> ());
1104 
1105  if (debug) {
1106  std::ostringstream os;
1107  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: "
1108  "Call permute kernel on device" << std::endl;
1109  std::cerr << os.str ();
1110  }
1111  if (nonConstStride) {
1112  // No need to sync first, because copyOnHost argument to
1113  // getDualViewCopyFromArrayView puts them in the right place.
1114  auto tgtWhichVecs_d =
1115  create_const_view (tgtWhichVecs.template view<DMS> ());
1116  auto srcWhichVecs_d =
1117  create_const_view (srcWhichVecs.template view<DMS> ());
1118  permute_array_multi_column_variable_stride (tgt_d, src_d,
1119  permuteToLIDs_d,
1120  permuteFromLIDs_d,
1121  tgtWhichVecs_d,
1122  srcWhichVecs_d, numCols);
1123  }
1124  else {
1125  permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1126  permuteFromLIDs_d, numCols);
1127  }
1128  }
1129 
1130  if (debug) {
1131  std::ostringstream os;
1132  os << "(Proc " << myRank << ") MV::copyAndPermuteNew: Done" << std::endl;
1133  std::cerr << os.str ();
1134  }
1135  }
1136 
// packAndPrepareNew: the "pack" phase of an Import or Export.  Packs
// the rows of sourceObj (which must be a MultiVector; see checkSizes)
// listed in exportLIDs into the flat 'exports' buffer, one contiguous
// run of numCols entries per exported row, resizing 'exports' if
// needed.  Packing happens on host or device, depending on where
// exportLIDs was most recently modified; sets constantNumPackets to
// the number of columns (one packet per column per LID).
1137  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1138  void
1139  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1140  packAndPrepareNew (const SrcDistObject& sourceObj,
1141  const Kokkos::DualView<const local_ordinal_type*, device_type>& exportLIDs,
1142  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1143  const Kokkos::DualView<size_t*, buffer_device_type>& /* numExportPacketsPerLID */,
1144  size_t& constantNumPackets,
1145  Distributor & /* distor */ )
1146  {
1147  using ::Tpetra::Details::Behavior;
1148  using ::Tpetra::Details::ProfilingRegion;
1149  using Kokkos::Compat::create_const_view;
1150  using Kokkos::Compat::getKokkosViewDeepCopy;
1151  typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> MV;
1152  typedef impl_scalar_type IST;
1153  typedef Kokkos::HostSpace host_memory_space;
1154  typedef typename Kokkos::DualView<IST*, device_type>::t_dev::memory_space
1155  dev_memory_space;
1156  typedef typename Kokkos::DualView<IST*, device_type>::t_host::execution_space
1157  host_execution_space;
1158  typedef typename Kokkos::DualView<IST*, device_type>::t_dev::execution_space
1159  dev_execution_space;
1160  ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1161 
1162  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1163  // have the option to check indices. We do so when Tpetra is in
1164  // debug mode. It is in debug mode by default in a debug build,
1165  // but you may control this at run time, before launching the
1166  // executable, by setting the TPETRA_DEBUG environment variable to
1167  // "1" (or "TRUE").
1168  const bool debugCheckIndices = Behavior::debug ();
1169  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1170  // environment variable to "1" (or "TRUE") for copious debug
1171  // output to std::cerr on every MPI process. This is unwise for
1172  // runs with large numbers of MPI processes.
1173  const bool printDebugOutput = Behavior::verbose ();
1174 
1175  int myRank = 0;
1176  if (printDebugOutput && ! this->getMap ().is_null () &&
1177  ! this->getMap ()->getComm ().is_null ()) {
1178  myRank = this->getMap ()->getComm ()->getRank ();
1179  }
1180 
1181  if (printDebugOutput) {
1182  std::ostringstream os;
1183  os << "Proc " << myRank << ": MV::packAndPrepareNew" << std::endl;
1184  std::cerr << os.str ();
1185  }
1186  // We've already called checkSizes(), so this cast must succeed.
1187  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
1188 
1189  // mfh 12 Apr 2016: packAndPrepareNew decides where to pack based
1190  // on the memory space in which exportLIDs was last modified.
1191  // (DistObject::doTransferNew decides this currently.)
1192  //
1193  // We unfortunately can't change the source object sourceMV.
1194  // Thus, we can't sync it to the memory space where we want to
1195  // pack communication buffers. As a result, for example, if
1196  // exportLIDs wants us to pack on host, but sourceMV was last
1197  // modified on device, we have to allocate a temporary host
1198  // version and copy back to host before we can pack. Similarly,
1199  // if exportLIDs wants us to pack on device, but sourceMV was last
1200  // modified on host, we have to allocate a temporary device
1201  // version and copy back to device before we can pack.
1202  const bool packOnHost =
1203  exportLIDs.modified_host () > exportLIDs.modified_device ();
1204  auto src_dev = sourceMV.template getLocalView<dev_memory_space> ();
1205  auto src_host = sourceMV.template getLocalView<host_memory_space> ();
1206  if (packOnHost) {
1207  if (sourceMV.template need_sync<Kokkos::HostSpace> ()) {
1208  if (printDebugOutput) {
1209  std::ostringstream os;
1210  os << "Proc " << myRank << ": MV::packAndPrepareNew: "
1211  "Pack on host, need sync to host" << std::endl;
1212  std::cerr << os.str ();
1213  }
1214  // sourceMV was most recently updated on device; copy to host.
1215  // Allocate a new host mirror. We'll use it for packing below.
1216  src_host = decltype (src_host) ("MV::DualView::h_view",
1217  src_dev.extent (0),
1218  src_dev.extent (1));
1219  Kokkos::deep_copy (src_host, src_dev);
1220  }
1221  }
1222  else { // pack on device
1223  if (sourceMV.template need_sync<device_type> ()) {
1224  if (printDebugOutput) {
1225  std::ostringstream os;
1226  os << "Proc " << myRank << ": MV::packAndPrepareNew: "
1227  "Pack on device, need sync to device" << std::endl;
1228  std::cerr << os.str ();
1229  }
1230  // sourceMV was most recently updated on host; copy to device.
1231  // Allocate a new "device mirror." We'll use it for packing below.
1232  src_dev = decltype (src_dev) ("MV::DualView::d_view",
1233  src_host.extent (0),
1234  src_host.extent (1));
1235  Kokkos::deep_copy (src_dev, src_host);
1236  }
1237  }
1238 
1239  const size_t numCols = sourceMV.getNumVectors ();
1240 
1241  // This spares us from needing to fill numExportPacketsPerLID.
1242  // Setting constantNumPackets to a nonzero value signals that
1243  // all packets have the same number of entries.
1244  constantNumPackets = numCols;
1245 
1246  // If we have no exports, there is nothing to do. Make sure this
1247  // goes _after_ setting constantNumPackets correctly.
1248  if (exportLIDs.extent (0) == 0) {
1249  if (printDebugOutput) {
1250  std::ostringstream os;
1251  os << "Proc " << myRank << ": MV::packAndPrepareNew: "
1252  "No exports on this proc, DONE" << std::endl;
1253  std::cerr << os.str ();
1254  }
1255  return;
1256  }
1257 
1258  /* The layout in the export for MultiVectors is as follows:
1259  exports = { all of the data from row exportLIDs.front() ;
1260  ....
1261  all of the data from row exportLIDs.back() }
1262  This doesn't have the best locality, but is necessary because
1263  the data for a Packet (all data associated with an LID) is
1264  required to be contiguous. */
1265 
1266  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1267  // packing scheme in the above comment? The data going to a
1268  // particular process must be contiguous, of course, but those
1269  // data could include entries from multiple LIDs. DistObject just
1270  // needs to know how to index into that data. Kokkos is good at
1271  // decoupling storage intent from data layout choice.
1272 
1273  const size_t numExportLIDs = exportLIDs.extent (0);
1274  const size_t newExportsSize = numCols * numExportLIDs;
1275  if (printDebugOutput) {
1276  std::ostringstream os;
1277  os << "Proc " << myRank << ": MV::packAndPrepareNew: realloc: "
1278  << "numExportLIDs = " << numExportLIDs
1279  << ", exports.extent(0) = " << exports.extent (0)
1280  << ", newExportsSize = " << newExportsSize << std::endl;
1281  std::cerr << os.str ();
1282  }
1283  ::Tpetra::Details::reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1284 
1285  // 'exports' may have different memory spaces than device_type
1286  // would indicate. See GitHub issue #1088. Abbreviations:
1287  // "exports host memory space" and "exports device memory spaces."
1288  typedef typename std::decay<decltype (exports) >::type::t_host::memory_space EHMS;
1289  typedef typename std::decay<decltype (exports) >::type::t_dev::memory_space EDMS;
1290 
1291  // Mark 'exports' here, since we might have resized it above.
1292  // Resizing currently requires calling the constructor, which
1293  // clears out the 'modified' flags.
1294  if (packOnHost) {
1295  exports.template modify<EHMS> ();
1296  }
1297  else {
1298  exports.template modify<EDMS> ();
1299  }
1300 
1301  if (numCols == 1) { // special case for one column only
1302  // MultiVector always represents a single column with constant
1303  // stride, but it doesn't hurt to implement both cases anyway.
1304  //
1305  // ETP: I'm not sure I agree with the above statement. Can't a single-
1306  // column multivector be a subview of another multi-vector, in which case
1307  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1308  // separately.
1309  //
1310  // mfh 18 Jan 2016: In answer to ETP's comment above:
1311  // MultiVector treats single-column MultiVectors created using a
1312  // "nonconstant stride constructor" as a special case, and makes
1313  // them constant stride (by making whichVectors_ have length 0).
1314  if (sourceMV.isConstantStride ()) {
1315  using KokkosRefactor::Details::pack_array_single_column;
1316  if (printDebugOutput) {
1317  std::ostringstream os;
1318  os << "Proc " << myRank << ": MV::packAndPrepareNew: "
1319  "pack numCols=1 const stride" << std::endl;
1320  std::cerr << os.str ();
1321  }
1322  if (packOnHost) {
1323  pack_array_single_column (exports.template view<EHMS> (),
1324  create_const_view (src_host),
1325  exportLIDs.template view<host_memory_space> (),
1326  0,
1327  debugCheckIndices);
1328  }
1329  else { // pack on device
1330  pack_array_single_column (exports.template view<EDMS> (),
1331  create_const_view (src_dev),
1332  exportLIDs.template view<dev_memory_space> (),
1333  0,
1334  debugCheckIndices);
1335  }
1336  }
1337  else {
1338  using KokkosRefactor::Details::pack_array_single_column;
1339  if (printDebugOutput) {
1340  std::ostringstream os;
1341  os << "Proc " << myRank << ": MV::packAndPrepareNew: "
1342  "pack numCols=1 nonconst stride" << std::endl;
1343  std::cerr << os.str ();
1344  }
// Nonconstant stride: the single logical column lives at physical
// column whichVectors_[0].
1345  if (packOnHost) {
1346  pack_array_single_column (exports.template view<EHMS> (),
1347  create_const_view (src_host),
1348  exportLIDs.template view<host_memory_space> (),
1349  sourceMV.whichVectors_[0],
1350  debugCheckIndices);
1351  }
1352  else { // pack on device
1353  pack_array_single_column (exports.template view<EDMS> (),
1354  create_const_view (src_dev),
1355  exportLIDs.template view<dev_memory_space> (),
1356  sourceMV.whichVectors_[0],
1357  debugCheckIndices);
1358  }
1359  }
1360  }
1361  else { // the source MultiVector has multiple columns
1362  if (sourceMV.isConstantStride ()) {
1363  using KokkosRefactor::Details::pack_array_multi_column;
1364  if (printDebugOutput) {
1365  std::ostringstream os;
1366  os << "Proc " << myRank << ": MV::packAndPrepareNew: "
1367  "pack numCols>1 const stride" << std::endl;
1368  std::cerr << os.str ();
1369  }
1370  if (packOnHost) {
1371  pack_array_multi_column (exports.template view<EHMS> (),
1372  create_const_view (src_host),
1373  exportLIDs.template view<host_memory_space> (),
1374  numCols,
1375  debugCheckIndices);
1376  }
1377  else { // pack on device
1378  pack_array_multi_column (exports.template view<EDMS> (),
1379  create_const_view (src_dev),
1380  exportLIDs.template view<dev_memory_space> (),
1381  numCols,
1382  debugCheckIndices);
1383  }
1384  }
1385  else {
1386  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1387  if (printDebugOutput) {
1388  std::ostringstream os;
1389  os << "Proc " << myRank << ": MV::packAndPrepareNew: "
1390  "pack numCols>1 nonconst stride" << std::endl;
1391  std::cerr << os.str ();
1392  }
// The variable-stride kernel needs the whichVectors_ column map in
// the memory space where packing happens; copy it there.
1393  if (packOnHost) {
1394  pack_array_multi_column_variable_stride (exports.template view<EHMS> (),
1395  create_const_view (src_host),
1396  exportLIDs.template view<host_memory_space> (),
1397  getKokkosViewDeepCopy<host_execution_space> (sourceMV.whichVectors_ ()),
1398  numCols,
1399  debugCheckIndices);
1400  }
1401  else { // pack on device
1402  pack_array_multi_column_variable_stride (exports.template view<EDMS> (),
1403  create_const_view (src_dev),
1404  exportLIDs.template view<dev_memory_space> (),
1405  getKokkosViewDeepCopy<dev_execution_space> (sourceMV.whichVectors_ ()),
1406  numCols,
1407  debugCheckIndices);
1408  }
1409  }
1410  }
1411 
1412  if (printDebugOutput) {
1413  std::ostringstream os;
1414  os << "Proc " << myRank << ": MV::packAndPrepareNew: DONE" << std::endl;
1415  std::cerr << os.str ();
1416  }
1417  }
1418 
1419 
1420  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1421  void
1422  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1423  unpackAndCombineNew (const Kokkos::DualView<const local_ordinal_type*, device_type>& importLIDs,
1424  const Kokkos::DualView<const impl_scalar_type*, buffer_device_type>& imports,
1425  const Kokkos::DualView<const size_t*, buffer_device_type>& /* numPacketsPerLID */,
1426  const size_t constantNumPackets,
1427  Distributor& /* distor */,
1428  const CombineMode CM)
1429  {
1430  using ::Tpetra::Details::Behavior;
1431  using ::Tpetra::Details::ProfilingRegion;
1433  using KokkosRefactor::Details::unpack_array_multi_column;
1434  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1435  using Kokkos::Compat::getKokkosViewDeepCopy;
1436  typedef impl_scalar_type IST;
1437  typedef typename Kokkos::DualView<IST*,
1438  device_type>::t_dev::memory_space DMS;
1439  // For correct UVM use, make the "host memory space" (template
1440  // parameter of sync and modify) different than the "device memory
1441  // space." Otherwise, sync() won't fence (indeed, it won't do
1442  // anything).
1443  typedef Kokkos::HostSpace HMS;
1444  const char tfecfFuncName[] = "unpackAndCombineNew: ";
1445  ProfilingRegion regionUAC ("Tpetra::MultiVector::unpackAndCombine");
1446 
1447  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1448  // have the option to check indices. We do so when Tpetra is in
1449  // debug mode. It is in debug mode by default in a debug build,
1450  // but you may control this at run time, before launching the
1451  // executable, by setting the TPETRA_DEBUG environment variable to
1452  // "1" (or "TRUE").
1453  const bool debugCheckIndices = Behavior::debug ();
1454 
1455  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1456  // environment variable to "1" (or "TRUE") for copious debug
1457  // output to std::cerr on every MPI process. This is unwise for
1458  // runs with large numbers of MPI processes.
1459  const bool printDebugOutput = Behavior::verbose ();
1460  int myRank = 0;
1461  if (printDebugOutput && ! this->getMap ().is_null () &&
1462  ! this->getMap ()->getComm ().is_null ()) {
1463  myRank = this->getMap ()->getComm ()->getRank ();
1464  }
1465 
1466  // If we have no imports, there is nothing to do
1467  if (importLIDs.extent (0) == 0) {
1468  return;
1469  }
1470 
1471  const size_t numVecs = getNumVectors ();
1472  if (debugCheckIndices) {
1473  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1474  (static_cast<size_t> (imports.extent (0)) !=
1475  numVecs * importLIDs.extent (0),
1476  std::runtime_error,
1477  "imports.extent(0) = " << imports.extent (0)
1478  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1479  << " * " << importLIDs.extent (0) << " = "
1480  << numVecs * importLIDs.extent (0) << ".");
1481 
1482  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1483  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1484  "constantNumPackets input argument must be nonzero.");
1485 
1486  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1487  (static_cast<size_t> (numVecs) !=
1488  static_cast<size_t> (constantNumPackets),
1489  std::runtime_error, "constantNumPackets must equal numVecs.");
1490  }
1491 
1492  // mfh 12 Apr 2016: Decide where to unpack based on the memory
1493  // space in which the imports buffer was last modified.
1494  // DistObject::doTransferNew gets to decide this. We currently
1495  // require importLIDs to match (its most recent version must be in
1496  // the same memory space as imports' most recent version).
1497  const bool unpackOnHost =
1498  imports.modified_host () != 0 &&
1499  imports.modified_host () > imports.modified_device ();
1500 
1501  if (printDebugOutput) {
1502  std::ostringstream os;
1503  os << "(Proc " << myRank << ") MV::unpackAndCombine: unpackOnHost: "
1504  << (unpackOnHost ? "true" : "false") << std::endl;
1505  std::cerr << os.str ();
1506  }
1507 
1508  // Kokkos::DualView<const T*, ...> forbids sync, so "cast away
1509  // const." It still makes sense for the importLIDs to be const,
1510  // because this method does not _modify_ them. We just need them
1511  // to be in the right place.
1512  auto theImportLIDs = castAwayConstDualView (importLIDs);
1513  if (unpackOnHost && importLIDs.modified_host () < importLIDs.modified_device ()) {
1514  if (printDebugOutput) {
1515  std::ostringstream os;
1516  os << "(Proc " << myRank << ") MV::unpackAndCombine: sync importLIDs "
1517  "to host" << std::endl;
1518  std::cerr << os.str ();
1519  }
1520  // 'imports' was last modified on host, but importLIDs was last
1521  // modified on device. Sync importLIDs to host and do the work
1522  // there, since imports usually at least as much data (and often
1523  // has more) than importLIDs.
1524  theImportLIDs.template sync<HMS> ();
1525  }
1526  else if (! unpackOnHost && importLIDs.modified_host () > importLIDs.modified_device ()) {
1527  if (printDebugOutput) {
1528  std::ostringstream os;
1529  os << "(Proc " << myRank << ") MV::unpackAndCombine: sync importLIDs "
1530  "to device" << std::endl;
1531  std::cerr << os.str ();
1532  }
1533  // 'imports' was last modified on device, but importLIDs was
1534  // last modified on host. Sync importLIDs to device and do the
1535  // work there, since imports usually at least as much data (and
1536  // often has more) than importLIDs.
1537  theImportLIDs.template sync<DMS> ();
1538  }
1539  else if (printDebugOutput) {
1540  std::ostringstream os;
1541  os << "(Proc " << myRank << ") MV::unpackAndCombine: no need to sync "
1542  "importLIDs" << std::endl;
1543  std::cerr << os.str ();
1544  }
1545 
1546  // We have to sync before modifying, because this method may read
1547  // as well as write (depending on the CombineMode). This matters
1548  // because copyAndPermute may have modified *this in the other
1549  // memory space.
1550  if (unpackOnHost) {
1551  if(this->template need_sync<HMS> ()) this->template sync<HMS> ();
1552  this->template modify<HMS> ();
1553  }
1554  else { // unpack on device
1555  if(this->template need_sync<DMS> ()) this->template sync<DMS> ();
1556  this->template modify<DMS> ();
1557  }
1558  auto X_d = this->template getLocalView<DMS> ();
1559  auto X_h = this->template getLocalView<HMS> ();
1560  // 'imports' may have a different device memory space (see #1088).
1561  auto imports_d =
1562  imports.template view<typename buffer_device_type::memory_space> ();
1563  auto imports_h = imports.template view<Kokkos::HostSpace> ();
1564  auto importLIDs_d = importLIDs.template view<DMS> ();
1565  auto importLIDs_h = importLIDs.template view<HMS> ();
1566 
1567  Kokkos::DualView<size_t*, device_type> whichVecs;
1568  if (! isConstantStride ()) {
1569  Kokkos::View<const size_t*, HMS,
1570  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1571  numVecs);
1572  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1573  if (unpackOnHost) {
1574  whichVecs.template modify<HMS> ();
1575  Kokkos::deep_copy (whichVecs.template view<HMS> (), whichVecsIn);
1576  }
1577  else {
1578  whichVecs.template modify<DMS> ();
1579  Kokkos::deep_copy (whichVecs.template view<DMS> (), whichVecsIn);
1580  }
1581  }
1582  auto whichVecs_d = whichVecs.template view<DMS> ();
1583  auto whichVecs_h = whichVecs.template view<HMS> ();
1584 
1585  /* The layout in the export for MultiVectors is as follows:
1586  imports = { all of the data from row exportLIDs.front() ;
1587  ....
1588  all of the data from row exportLIDs.back() }
1589  This doesn't have the best locality, but is necessary because
1590  the data for a Packet (all data associated with an LID) is
1591  required to be contiguous. */
1592 
1593  if (printDebugOutput) {
1594  std::ostringstream os;
1595  os << "(Proc " << myRank << ") MV::unpackAndCombine: unpacking"
1596  << std::endl;
1597  std::cerr << os.str ();
1598  }
1599 
1600  if (numVecs > 0 && importLIDs.extent (0) > 0) {
1601  typedef typename Kokkos::DualView<IST*,
1602  device_type>::t_dev::execution_space dev_exec_space;
1603  typedef typename Kokkos::DualView<IST*,
1604  device_type>::t_host::execution_space host_exec_space;
1605 
1606  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1607  // custom combine modes, start editing here. Also, if you trust
1608  // inlining, it would be nice to condense this code by using a
1609  // binary function object f in the pack functors.
1610  if (CM == INSERT || CM == REPLACE) {
1611  KokkosRefactor::Details::InsertOp<execution_space> op;
1612 
1613  if (isConstantStride ()) {
1614  if (unpackOnHost) {
1615  // FIXME (mfh 29 Aug 2017) Problem: X_h's execution space
1616  // controls where unpacking takes place, but the
1617  // HostMirror of a CudaUVMSpace View has execution_space
1618  // Cuda! This is bad if imports_h is actually a host
1619  // View. This means that we need to control the execution
1620  // space in which unpack_array_multi_column works.
1621  unpack_array_multi_column (host_exec_space (),
1622  X_h, imports_h, importLIDs_h, op,
1623  numVecs, debugCheckIndices);
1624 
1625  }
1626  else { // unpack on device
1627  unpack_array_multi_column (dev_exec_space (),
1628  X_d, imports_d, importLIDs_d, op,
1629  numVecs, debugCheckIndices);
1630  }
1631  }
1632  else { // not constant stride
1633  if (unpackOnHost) {
1634  // FIXME (mfh 29 Aug 2017) Problem: X_h's execution space
1635  // controls where unpacking takes place, but the
1636  // HostMirror of a CudaUVMSpace View has execution_space
1637  // Cuda! This is bad if imports_h is actually a host
1638  // View. This means that we need to control the execution
1639  // space in which
1640  // unpack_array_multi_column_variable_stride works.
1641  unpack_array_multi_column_variable_stride (host_exec_space (),
1642  X_h, imports_h,
1643  importLIDs_h,
1644  whichVecs_h, op,
1645  numVecs,
1646  debugCheckIndices);
1647  }
1648  else { // unpack on device
1649  unpack_array_multi_column_variable_stride (dev_exec_space (),
1650  X_d, imports_d,
1651  importLIDs_d,
1652  whichVecs_d, op,
1653  numVecs,
1654  debugCheckIndices);
1655  }
1656  }
1657  }
1658  else if (CM == ADD) {
1659  KokkosRefactor::Details::AddOp<execution_space> op;
1660 
1661  if (isConstantStride ()) {
1662  if (unpackOnHost) {
1663  unpack_array_multi_column (host_exec_space (),
1664  X_h, imports_h, importLIDs_h, op,
1665  numVecs, debugCheckIndices);
1666  }
1667  else { // unpack on device
1668  unpack_array_multi_column (dev_exec_space (),
1669  X_d, imports_d, importLIDs_d, op,
1670  numVecs, debugCheckIndices);
1671  }
1672  }
1673  else { // not constant stride
1674  if (unpackOnHost) {
1675  unpack_array_multi_column_variable_stride (host_exec_space (),
1676  X_h, imports_h,
1677  importLIDs_h,
1678  whichVecs_h, op,
1679  numVecs,
1680  debugCheckIndices);
1681  }
1682  else { // unpack on device
1683  unpack_array_multi_column_variable_stride (dev_exec_space (),
1684  X_d, imports_d,
1685  importLIDs_d,
1686  whichVecs_d, op,
1687  numVecs,
1688  debugCheckIndices);
1689  }
1690  }
1691  }
1692  else if (CM == ABSMAX) {
1693  KokkosRefactor::Details::AbsMaxOp<execution_space> op;
1694 
1695  if (isConstantStride ()) {
1696  if (unpackOnHost) {
1697  unpack_array_multi_column (host_exec_space (),
1698  X_h, imports_h, importLIDs_h, op,
1699  numVecs, debugCheckIndices);
1700  }
1701  else { // unpack on device
1702  unpack_array_multi_column (dev_exec_space (),
1703  X_d, imports_d, importLIDs_d, op,
1704  numVecs, debugCheckIndices);
1705  }
1706  }
1707  else {
1708  if (unpackOnHost) {
1709  unpack_array_multi_column_variable_stride (host_exec_space (),
1710  X_h, imports_h,
1711  importLIDs_h,
1712  whichVecs_h, op,
1713  numVecs,
1714  debugCheckIndices);
1715  }
1716  else { // unpack on device
1717  unpack_array_multi_column_variable_stride (dev_exec_space (),
1718  X_d, imports_d,
1719  importLIDs_d,
1720  whichVecs_d, op,
1721  numVecs,
1722  debugCheckIndices);
1723  }
1724  }
1725  }
1726  }
1727 
1728  if (printDebugOutput) {
1729  std::ostringstream os;
1730  os << "(Proc " << myRank << ") MV::unpackAndCombine: Done!"
1731  << std::endl;
1732  std::cerr << os.str ();
1733  }
1734  }
1735 
1736  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1737  size_t
1738  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1739  getNumVectors () const
1740  {
1741  if (isConstantStride ()) {
1742  return static_cast<size_t> (view_.extent (1));
1743  } else {
1744  return static_cast<size_t> (whichVectors_.size ());
1745  }
1746  }
1747 
1748  namespace { // (anonymous)
1749 
1750  template<class RV>
1751  void
1752  gblDotImpl (const RV& dotsOut,
1753  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1754  const bool distributed)
1755  {
1756  using Teuchos::REDUCE_MAX;
1757  using Teuchos::REDUCE_SUM;
1758  using Teuchos::reduceAll;
1759  typedef typename RV::non_const_value_type dot_type;
1760 
1761  const size_t numVecs = dotsOut.extent (0);
1762 
1763  // If the MultiVector is distributed over multiple processes, do
1764  // the distributed (interprocess) part of the dot product. We
1765  // assume that the MPI implementation can read from and write to
1766  // device memory.
1767  //
1768  // replaceMap() may have removed some processes. Those
1769  // processes have a null Map. They must not participate in any
1770  // collective operations. We ask first whether the Map is null,
1771  // because isDistributed() defers that question to the Map. We
1772  // still compute and return local dots for processes not
1773  // participating in collective operations; those probably don't
1774  // make any sense, but it doesn't hurt to do them, since it's
1775  // illegal to call dot() on those processes anyway.
1776  if (distributed && ! comm.is_null ()) {
1777  // The calling process only participates in the collective if
1778  // both the Map and its Comm on that process are nonnull.
1779  const int nv = static_cast<int> (numVecs);
1780  const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
1781 
1782  if (commIsInterComm) {
1783  // If comm is an intercomm, then we may not alias input and
1784  // output buffers, so we have to make a copy of the local
1785  // sum.
1786  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
1787  Kokkos::deep_copy (lclDots, dotsOut);
1788  const dot_type* const lclSum = lclDots.data ();
1789  dot_type* const gblSum = dotsOut.data ();
1790  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
1791  }
1792  else {
1793  dot_type* const inout = dotsOut.data ();
1794  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
1795  }
1796  }
1797  }
1798  } // namespace (anonymous)
1799 
1800  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1801  void
1802  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1804  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
1805  {
1806  using ::Tpetra::Details::Behavior;
1807  using Kokkos::create_mirror_view;
1808  using Kokkos::subview;
1809  using Teuchos::Comm;
1810  using Teuchos::null;
1811  using Teuchos::RCP;
1812  // View of all the dot product results.
1813  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
1815  typedef typename dual_view_type::t_dev XMV;
1816  typedef typename MV::dual_view_type::t_dev::memory_space dev_memory_space;
1817  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
1818 
1819  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
1820 
1821  const size_t numVecs = this->getNumVectors ();
1822  if (numVecs == 0) {
1823  return; // nothing to do
1824  }
1825  const size_t lclNumRows = this->getLocalLength ();
1826  const size_t numDots = static_cast<size_t> (dots.extent (0));
1827  const bool debug = Behavior::debug ();
1828 
1829  if (debug) {
1830  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
1831  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1832  (! compat, std::invalid_argument, "'*this' MultiVector is not "
1833  "compatible with the input MultiVector A. We only test for this "
1834  "in debug mode.");
1835  }
1836 
1837  // FIXME (mfh 11 Jul 2014) These exception tests may not
1838  // necessarily be thrown on all processes consistently. We should
1839  // instead pass along error state with the inner product. We
1840  // could do this by setting an extra slot to
1841  // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
1842  // final sum should be
1843  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1844  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1845  lclNumRows != A.getLocalLength (), std::runtime_error,
1846  "MultiVectors do not have the same local length. "
1847  "this->getLocalLength() = " << lclNumRows << " != "
1848  "A.getLocalLength() = " << A.getLocalLength () << ".");
1849  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1850  numVecs != A.getNumVectors (), std::runtime_error,
1851  "MultiVectors must have the same number of columns (vectors). "
1852  "this->getNumVectors() = " << numVecs << " != "
1853  "A.getNumVectors() = " << A.getNumVectors () << ".");
1854  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1855  numDots != numVecs, std::runtime_error,
1856  "The output array 'dots' must have the same number of entries as the "
1857  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
1858  numDots << " != this->getNumVectors() = " << numVecs << ".");
1859 
1860  const std::pair<size_t, size_t> colRng (0, numVecs);
1861  RV dotsOut = subview (dots, colRng);
1862  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
1863  this->getMap ()->getComm ();
1864 
1865  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
1866  if (this->template need_sync<dev_memory_space> ()) const_cast<MV*>(this)->template sync<dev_memory_space> ();
1867  if (A.template need_sync<dev_memory_space> ()) const_cast<MV&>(A).template sync<dev_memory_space> ();
1868 
1869  auto thisView = this->template getLocalView<dev_memory_space> ();
1870  auto A_view = A.template getLocalView<dev_memory_space> ();
1871 
1872  ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
1873  this->whichVectors_.getRawPtr (),
1874  A.whichVectors_.getRawPtr (),
1875  this->isConstantStride (), A.isConstantStride ());
1876  gblDotImpl (dotsOut, comm, this->isDistributed ());
1877  }
1878 
1879  namespace { // (anonymous)
1880  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1882  multiVectorSingleColumnDot (MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
1884  {
1885  using Teuchos::Comm;
1886  using Teuchos::RCP;
1888  typedef typename MV::dot_type dot_type;
1889  typedef typename MV::dual_view_type::t_dev::memory_space dev_memory_space;
1890  ::Tpetra::Details::ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
1891 
1892  RCP<const typename MV::map_type> map = x.getMap ();
1893  RCP<const Comm<int> > comm = map.is_null () ? Teuchos::null : map->getComm ();
1894  if (comm.is_null ()) {
1895  return Kokkos::ArithTraits<dot_type>::zero ();
1896  }
1897  else {
1898  typedef LocalOrdinal LO;
1899  // The min just ensures that we don't overwrite memory that
1900  // doesn't belong to us, in the erroneous input case where x
1901  // and y have different numbers of rows.
1902  const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
1903  y.getLocalLength ()));
1904  const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
1905  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
1906  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
1907 
1908  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
1909  if (x.template need_sync<dev_memory_space> ()) x.template sync<dev_memory_space> ();
1910  if (y.template need_sync<dev_memory_space> ()) const_cast<MV&>(y).template sync<dev_memory_space> ();
1911 
1912  x.template modify<dev_memory_space> ();
1913  auto x_2d = x.template getLocalView<dev_memory_space> ();
1914  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
1915  auto y_2d = y.template getLocalView<dev_memory_space> ();
1916  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
1917  lclDot = KokkosBlas::dot (x_1d, y_1d);
1918 
1919  if (x.isDistributed ()) {
1920  using Teuchos::outArg;
1921  using Teuchos::REDUCE_SUM;
1922  using Teuchos::reduceAll;
1923  reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
1924  }
1925  else {
1926  gblDot = lclDot;
1927  }
1928  return gblDot;
1929  }
1930  }
1931  } // namespace (anonymous)
1932 
1933 
1934 
1935  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1936  void
1937  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1939  const Teuchos::ArrayView<dot_type>& dots) const
1940  {
1942  const char tfecfFuncName[] = "dot: ";
1943  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
1944 
1945  const size_t numVecs = this->getNumVectors ();
1946  const size_t lclNumRows = this->getLocalLength ();
1947  const size_t numDots = static_cast<size_t> (dots.size ());
1948 
1949  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
1950  // not necessarily be thrown on all processes consistently. We
1951  // keep them for now, because MultiVector's unit tests insist on
1952  // them. In the future, we should instead pass along error state
1953  // with the inner product. We could do this by setting an extra
1954  // slot to Kokkos::Details::ArithTraits<dot_type>::one() on error.
1955  // The final sum should be
1956  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1957  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1958  (lclNumRows != A.getLocalLength (), std::runtime_error,
1959  "MultiVectors do not have the same local length. "
1960  "this->getLocalLength() = " << lclNumRows << " != "
1961  "A.getLocalLength() = " << A.getLocalLength () << ".");
1962  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1963  (numVecs != A.getNumVectors (), std::runtime_error,
1964  "MultiVectors must have the same number of columns (vectors). "
1965  "this->getNumVectors() = " << numVecs << " != "
1966  "A.getNumVectors() = " << A.getNumVectors () << ".");
1967  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1968  (numDots != numVecs, std::runtime_error,
1969  "The output array 'dots' must have the same number of entries as the "
1970  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
1971  numDots << " != this->getNumVectors() = " << numVecs << ".");
1972 
1973  if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
1974  const dot_type gblDot = multiVectorSingleColumnDot (const_cast<MV&> (*this), A);
1975  dots[0] = gblDot;
1976  }
1977  else {
1978  this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
1979  }
1980  }
1981 
1982 
1983  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1984  void
1986  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
1987  {
1988  typedef Kokkos::View<mag_type*, Kokkos::HostSpace> host_norms_view_type;
1989 
1990  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::norm2 (Teuchos::ArrayView)");
1991 
1992  const size_t numNorms = static_cast<size_t> (norms.size ());
1993  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
1994  this->norm2 (normsHostView); // Do the computation on the device.
1995  }
1996 
1997 
1998  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1999  void
2001  norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2002  {
2003  this->normImpl (norms, NORM_TWO);
2004  }
2005 
2006 
2007  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2008  void TPETRA_DEPRECATED
2011  const Teuchos::ArrayView<mag_type> &norms) const
2012  {
2013  using Kokkos::ALL;
2014  using Kokkos::subview;
2015  using Teuchos::Comm;
2016  using Teuchos::null;
2017  using Teuchos::RCP;
2018  using Teuchos::reduceAll;
2019  using Teuchos::REDUCE_SUM;
2020  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
2021  typedef Kokkos::Details::ArithTraits<mag_type> ATM;
2022  typedef Kokkos::View<mag_type*, Kokkos::HostSpace> norms_view_type;
2024  const char tfecfFuncName[] = "normWeighted: ";
2025 
2026  const size_t numVecs = this->getNumVectors ();
2027  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2028  static_cast<size_t> (norms.size ()) != numVecs, std::runtime_error,
2029  "norms.size() = " << norms.size () << " != this->getNumVectors() = "
2030  << numVecs << ".");
2031 
2032  const bool OneW = (weights.getNumVectors () == 1);
2033  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2034  ! OneW && weights.getNumVectors () != numVecs, std::runtime_error,
2035  "The input MultiVector of weights must contain either one column, "
2036  "or must have the same number of columns as *this. "
2037  "weights.getNumVectors() = " << weights.getNumVectors ()
2038  << " and this->getNumVectors() = " << numVecs << ".");
2039 
2040  const bool debug = ::Tpetra::Details::Behavior::debug ();
2041 
2042  if (debug) {
2043  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2044  (! this->getMap ()->isCompatible (*weights.getMap ()), std::runtime_error,
2045  "MultiVectors do not have compatible Maps:" << std::endl
2046  << "this->getMap(): " << std::endl << *this->getMap()
2047  << "weights.getMap(): " << std::endl << *weights.getMap() << std::endl);
2048  }
2049  else {
2050  const size_t lclNumRows = this->getLocalLength ();
2051  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2052  (lclNumRows != weights.getLocalLength (), std::runtime_error,
2053  "MultiVectors do not have the same local length.");
2054  }
2055 
2056  norms_view_type lclNrms ("Tpetra::MV::lclNrms", numVecs);
2057 
2058  // FIXME (mfh 18 May 2016) Yes, I know "const" is a lie.
2059  const_cast<MV*> (this)->template sync<device_type> ();
2060  const_cast<MV&> (weights).template sync<device_type> ();
2061 
2062  auto X_lcl = this->template getLocalView<device_type> ();
2063  auto W_lcl = weights.template getLocalView<device_type> ();
2064 
2065  if (isConstantStride () && ! OneW) {
2066  KokkosBlas::nrm2w_squared (lclNrms, X_lcl, W_lcl);
2067  }
2068  else {
2069  for (size_t j = 0; j < numVecs; ++j) {
2070  const size_t X_col = this->isConstantStride () ? j :
2071  this->whichVectors_[j];
2072  const size_t W_col = OneW ? static_cast<size_t> (0) :
2073  (weights.isConstantStride () ? j : weights.whichVectors_[j]);
2074  KokkosBlas::nrm2w_squared (subview (lclNrms, j),
2075  subview (X_lcl, ALL (), X_col),
2076  subview (W_lcl, ALL (), W_col));
2077  }
2078  }
2079 
2080  const mag_type OneOverN =
2081  ATM::one () / static_cast<mag_type> (this->getGlobalLength ());
2082  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
2083  Teuchos::null : this->getMap ()->getComm ();
2084 
2085  if (! comm.is_null () && this->isDistributed ()) {
2086  // Assume that MPI can access device memory.
2087  reduceAll<int, mag_type> (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2088  lclNrms.data (), norms.getRawPtr ());
2089  for (size_t k = 0; k < numVecs; ++k) {
2090  norms[k] = ATM::sqrt (norms[k] * OneOverN);
2091  }
2092  }
2093  else {
2094  for (size_t k = 0; k < numVecs; ++k) {
2095  norms[k] = ATM::sqrt (ATS::magnitude (lclNrms(k)) * OneOverN);
2096  }
2097  }
2098  }
2099 
2100 
2101  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2102  void
2104  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2105  {
2106  typedef typename Kokkos::View<mag_type*, Kokkos::HostSpace> host_norms_view_type;
2107 
2108  const size_t numNorms = static_cast<size_t> (norms.size ());
2109  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
2110  this->norm1 (normsHostView); // Do the computation on the device.
2111  }
2112 
2113 
2114  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2115  void
2117  norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2118  {
2119  this->normImpl (norms, NORM_ONE);
2120  }
2121 
2122  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2123  void
2125  normInf (const Teuchos::ArrayView<mag_type>& norms) const
2126  {
2127  typedef Kokkos::View<mag_type*, Kokkos::HostSpace> host_norms_view_type;
2128 
2129  const size_t numNorms = static_cast<size_t> (norms.size ());
2130  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
2131  this->normInf (normsHostView); // Do the computation on the device.
2132  }
2133 
2134 
2135  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2136  void
2138  normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2139  {
2140  this->normImpl (norms, NORM_INF);
2141  }
2142 
2143  namespace { // (anonymous)
2144 
2147  IMPL_NORM_ONE, //<! Use the one-norm
2148  IMPL_NORM_TWO, //<! Use the two-norm
2149  IMPL_NORM_INF //<! Use the infinity-norm
2150  };
2151 
  /// \brief Compute the local (per-process) part of the requested norm
  ///   of each column of X_lcl, writing one result per column into
  ///   normsOut.  For IMPL_NORM_TWO the results are the SQUARES of the
  ///   local two-norms; gblNormImpl() takes the square root after the
  ///   global reduction.
  ///
  /// \param normsOut [out] Rank-1 Kokkos::View, at least numVecs entries.
  /// \param X_lcl [in] Rank-2 Kokkos::View of the local multivector data.
  /// \param lclNumRows [in] Number of local rows to process.
  /// \param numVecs [in] Number of columns (vectors).
  /// \param whichVecs [in] Column indices, used only if ! constantStride.
  /// \param constantStride [in] Whether X_lcl's columns map one-to-one
  ///   to the MultiVector's columns.
  /// \param whichNorm [in] Which norm to compute.
  template<class RV, class XMV>
  void
  lclNormImpl (const RV& normsOut,
               const XMV& X_lcl,
               const size_t lclNumRows,
               const size_t numVecs,
               const Teuchos::ArrayView<const size_t>& whichVecs,
               const bool constantStride,
               const EWhichNormImpl whichNorm)
  {
    using Kokkos::ALL;
    using Kokkos::subview;
    typedef typename RV::non_const_value_type mag_type;

    static_assert (Kokkos::Impl::is_view<RV>::value,
                   "Tpetra::MultiVector::lclNormImpl: "
                   "The first argument RV is not a Kokkos::View.");
    static_assert (RV::rank == 1, "Tpetra::MultiVector::lclNormImpl: "
                   "The first argument normsOut must have rank 1.");
    static_assert (Kokkos::Impl::is_view<XMV>::value,
                   "Tpetra::MultiVector::lclNormImpl: "
                   "The second argument X_lcl is not a Kokkos::View.");
    static_assert (XMV::rank == 2, "Tpetra::MultiVector::lclNormImpl: "
                   "The second argument X_lcl must have rank 2.");

    // In case the input dimensions don't match, make sure that we
    // don't overwrite memory that doesn't belong to us, by using
    // subset views with the minimum dimensions over all input.
    const std::pair<size_t, size_t> rowRng (0, lclNumRows);
    const std::pair<size_t, size_t> colRng (0, numVecs);
    RV theNorms = subview (normsOut, colRng);
    XMV X = subview (X_lcl, rowRng, Kokkos::ALL());

    // mfh 10 Mar 2015: Kokkos::(Dual)View subviews don't quite
    // behave how you think when they have zero rows.  In that case,
    // it returns a 0 x 0 (Dual)View.
    TEUCHOS_TEST_FOR_EXCEPTION(
      lclNumRows != 0 && constantStride && ( \
        ( X.extent (0) != lclNumRows ) ||
        ( X.extent (1) != numVecs ) ),
      std::logic_error, "Constant Stride X's dimensions are " << X.extent (0) << " x "
      << X.extent (1) << ", which differ from the local dimensions "
      << lclNumRows << " x " << numVecs << ". Please report this bug to "
      "the Tpetra developers.");

    // Non-constant stride: X may have MORE columns than numVecs, since
    // whichVecs selects a subset; only "fewer" is an error.
    TEUCHOS_TEST_FOR_EXCEPTION(
      lclNumRows != 0 && !constantStride && ( \
        ( X.extent (0) != lclNumRows ) ||
        ( X.extent (1) < numVecs ) ),
      std::logic_error, "Strided X's dimensions are " << X.extent (0) << " x "
      << X.extent (1) << ", which are incompatible with the local dimensions "
      << lclNumRows << " x " << numVecs << ". Please report this bug to "
      "the Tpetra developers.");

    if (lclNumRows == 0) {
      // No local rows: every local norm is zero (the identity for the
      // subsequent SUM / MAX reductions in gblNormImpl).
      const mag_type zeroMag = Kokkos::Details::ArithTraits<mag_type>::zero ();
      Kokkos::deep_copy(theNorms, zeroMag);
    }
    else { // lclNumRows != 0
      if (constantStride) {
        // All columns at once with a single KokkosBlas call.
        if (whichNorm == IMPL_NORM_INF) {
          KokkosBlas::nrminf (theNorms, X);
        }
        else if (whichNorm == IMPL_NORM_ONE) {
          KokkosBlas::nrm1 (theNorms, X);
        }
        else if (whichNorm == IMPL_NORM_TWO) {
          KokkosBlas::nrm2_squared (theNorms, X);
        }
        else {
          TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
        }
      }
      else { // not constant stride
        // NOTE (mfh 15 Jul 2014) This does a kernel launch for
        // every column.  It might be better to have a kernel that
        // does the work all at once.  On the other hand, we don't
        // prioritize performance of MultiVector views of
        // noncontiguous columns.
        for (size_t k = 0; k < numVecs; ++k) {
          const size_t X_col = constantStride ? k : whichVecs[k];
          if (whichNorm == IMPL_NORM_INF) {
            KokkosBlas::nrminf (subview (theNorms, k),
                                subview (X, ALL (), X_col));
          }
          else if (whichNorm == IMPL_NORM_ONE) {
            KokkosBlas::nrm1 (subview (theNorms, k),
                              subview (X, ALL (), X_col));
          }
          else if (whichNorm == IMPL_NORM_TWO) {
            KokkosBlas::nrm2_squared (subview (theNorms, k),
                                      subview (X, ALL (), X_col));
          }
          else {
            TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
          }
        } // for each column
      } // constantStride
    } // lclNumRows != 0
  }
2252 
2253  // Kokkos::parallel_for functor for applying square root to each
2254  // entry of a 1-D Kokkos::View.
2255  template<class ViewType>
2256  class SquareRootFunctor {
2257  public:
2258  typedef typename ViewType::execution_space execution_space;
2259  typedef typename ViewType::size_type size_type;
2260 
2261  SquareRootFunctor (const ViewType& theView) :
2262  theView_ (theView)
2263  {}
2264 
2265  KOKKOS_INLINE_FUNCTION void
2266  operator() (const size_type& i) const
2267  {
2268  typedef typename ViewType::non_const_value_type value_type;
2269  typedef Kokkos::Details::ArithTraits<value_type> KAT;
2270  theView_(i) = KAT::sqrt (theView_(i));
2271  }
2272 
2273  private:
2274  ViewType theView_;
2275  };
2276 
  /// \brief Finish computing norms: combine the local results in
  ///   normsOut over all processes, then (for the two-norm) replace the
  ///   summed squares with their square roots.
  ///
  /// \param normsOut [in/out] On input: local norm results (squared,
  ///   for IMPL_NORM_TWO).  On output: final global norms.
  /// \param comm [in] Communicator; may be null (e.g., on processes
  ///   excluded by replaceMap()).
  /// \param distributed [in] Whether the MultiVector is distributed.
  /// \param whichNorm [in] Which norm is being computed; selects MAX
  ///   (infinity-norm) vs. SUM reduction, and the final sqrt step.
  template<class RV>
  void
  gblNormImpl (const RV& normsOut,
               const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
               const bool distributed,
               const EWhichNormImpl whichNorm)
  {
    using Teuchos::REDUCE_MAX;
    using Teuchos::REDUCE_SUM;
    using Teuchos::reduceAll;
    typedef typename RV::non_const_value_type mag_type;

    const size_t numVecs = normsOut.extent (0);

    // If the MultiVector is distributed over multiple processes, do
    // the distributed (interprocess) part of the norm.  We assume
    // that the MPI implementation can read from and write to device
    // memory.
    //
    // replaceMap() may have removed some processes.  Those processes
    // have a null Map.  They must not participate in any collective
    // operations.  We ask first whether the Map is null, because
    // isDistributed() defers that question to the Map.  We still
    // compute and return local norms for processes not participating
    // in collective operations; those probably don't make any sense,
    // but it doesn't hurt to do them, since it's illegal to call
    // norm*() on those processes anyway.
    if (distributed && ! comm.is_null ()) {
      // The calling process only participates in the collective if
      // both the Map and its Comm on that process are nonnull.
      //
      // MPI doesn't allow aliasing of arguments, so we have to make
      // a copy of the local sum.
      RV lclNorms ("MV::normImpl lcl", numVecs);
      Kokkos::deep_copy (lclNorms, normsOut);
      const mag_type* const lclSum = lclNorms.data ();
      mag_type* const gblSum = normsOut.data ();
      const int nv = static_cast<int> (numVecs);
      // Infinity-norm combines with MAX; one- and two-norms with SUM.
      if (whichNorm == IMPL_NORM_INF) {
        reduceAll<int, mag_type> (*comm, REDUCE_MAX, nv, lclSum, gblSum);
      } else {
        reduceAll<int, mag_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
      }
    }

    if (whichNorm == IMPL_NORM_TWO) {
      // Replace the norm-squared results with their square roots in
      // place, to get the final output.  If the device memory and
      // the host memory are the same, it probably doesn't pay to
      // launch a parallel kernel for that, since there isn't enough
      // parallelism for the typical MultiVector case.
      const bool inHostMemory =
        Kokkos::Impl::is_same<typename RV::memory_space,
          typename RV::host_mirror_space::memory_space>::value;
      if (inHostMemory) {
        for (size_t j = 0; j < numVecs; ++j) {
          normsOut(j) = Kokkos::Details::ArithTraits<mag_type>::sqrt (normsOut(j));
        }
      }
      else {
        // There's not as much parallelism now, but that's OK.  The
        // point of doing parallel dispatch here is to keep the norm
        // results on the device, thus avoiding a copy to the host
        // and back again.
        SquareRootFunctor<RV> f (normsOut);
        typedef typename RV::execution_space execution_space;
        typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
        Kokkos::parallel_for (range_type (0, numVecs), f);
      }
    }
  }
2348 
2349  } // namespace (anonymous)
2350 
2351  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2352  void
2353  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2354  normImpl (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms,
2355  const EWhichNorm whichNorm) const
2356  {
2357  using Kokkos::create_mirror_view;
2358  using Kokkos::subview;
2359  using Teuchos::Comm;
2360  using Teuchos::null;
2361  using Teuchos::RCP;
2362  // View of all the norm results.
2363  typedef Kokkos::View<mag_type*, Kokkos::HostSpace> RV;
2364 
2365  const size_t numVecs = this->getNumVectors ();
2366  if (numVecs == 0) {
2367  return; // nothing to do
2368  }
2369  const size_t lclNumRows = this->getLocalLength ();
2370  const size_t numNorms = static_cast<size_t> (norms.extent (0));
2371  TEUCHOS_TEST_FOR_EXCEPTION(
2372  numNorms < numVecs, std::runtime_error, "Tpetra::MultiVector::normImpl: "
2373  "'norms' must have at least as many entries as the number of vectors in "
2374  "*this. norms.extent(0) = " << numNorms << " < this->getNumVectors()"
2375  " = " << numVecs << ".");
2376 
2377  const std::pair<size_t, size_t> colRng (0, numVecs);
2378  RV normsOut = subview (norms, colRng);
2379 
2380  EWhichNormImpl lclNormType;
2381  if (whichNorm == NORM_ONE) {
2382  lclNormType = IMPL_NORM_ONE;
2383  } else if (whichNorm == NORM_TWO) {
2384  lclNormType = IMPL_NORM_TWO;
2385  } else {
2386  lclNormType = IMPL_NORM_INF;
2387  }
2388 
2389  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2390  this->getMap ()->getComm ();
2391 
2392  // If we need sync to device, then host has the most recent version.
2393  const bool useHostVersion = this->template need_sync<device_type> ();
2394  if (useHostVersion) {
2395  // DualView was last modified on host, so run the local kernel there.
2396  // This means we need a host mirror of the array of norms too.
2397  typedef typename dual_view_type::t_host XMV;
2398  typedef typename XMV::memory_space cur_memory_space;
2399 
2400  auto thisView = this->template getLocalView<cur_memory_space> ();
2401  lclNormImpl<RV, XMV> (normsOut, thisView, lclNumRows, numVecs,
2402  this->whichVectors_, this->isConstantStride (),
2403  lclNormType);
2404  gblNormImpl (normsOut, comm, this->isDistributed (), lclNormType);
2405  }
2406  else {
2407  // DualView was last modified on device, so run the local kernel there.
2408  typedef typename dual_view_type::t_dev XMV;
2409  typedef typename XMV::memory_space cur_memory_space;
2410 
2411  auto thisView = this->template getLocalView<cur_memory_space> ();
2412  lclNormImpl<RV, XMV> (normsOut, thisView, lclNumRows, numVecs,
2413  this->whichVectors_, this->isConstantStride (),
2414  lclNormType);
2415  gblNormImpl (normsOut, comm, this->isDistributed (), lclNormType);
2416  }
2417  }
2418 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
{
  // Compute the arithmetic mean of each column of *this, writing the
  // result for column j into means[j].  The computation is: local
  // column sums (KokkosBlas::sum), a global REDUCE_SUM across the
  // communicator if the vector is distributed, then multiplication by
  // 1 / getGlobalLength().
  //
  // Throws std::runtime_error if means.size() != getNumVectors().

  // KR FIXME Overload this method to take a View.
  using Kokkos::ALL;
  using Kokkos::subview;
  using Teuchos::Comm;
  using Teuchos::RCP;
  using Teuchos::reduceAll;
  using Teuchos::REDUCE_SUM;
  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;

  const size_t lclNumRows = this->getLocalLength ();
  const size_t numVecs = this->getNumVectors ();
  const size_t numMeans = static_cast<size_t> (means.size ());

  TEUCHOS_TEST_FOR_EXCEPTION(
    numMeans != numVecs, std::runtime_error,
    "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
    << " != this->getNumVectors() = " << numVecs << ".");

  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
  const std::pair<size_t, size_t> colRng (0, numVecs);

  // Make sure that the final output view has the same layout as the
  // temporary view's HostMirror. Left or Right doesn't matter for
  // a 1-D array anyway; this is just to placate the compiler.
  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
  typedef Kokkos::View<impl_scalar_type*,
    typename local_view_type::HostMirror::array_layout,
    Kokkos::HostSpace,
    Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
  // Unmanaged view wrapping the caller's output buffer; results are
  // written directly into 'means'.
  host_local_view_type meansOut (means.getRawPtr (), numMeans);

  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
    this->getMap ()->getComm ();

  // If we need sync to device, then host has the most recent version.
  const bool useHostVersion = this->template need_sync<device_type> ();
  if (useHostVersion) {
    // DualView was last modified on host, so run the local kernel there.
    auto X_lcl = subview (this->template getLocalView<Kokkos::HostSpace> (),
                          rowRng, Kokkos::ALL ());
    // Compute the local sum of each column.
    Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
    if (isConstantStride ()) {
      KokkosBlas::sum (lclSums, X_lcl);
    }
    else {
      // Noncontiguous columns: sum each selected column individually.
      for (size_t j = 0; j < numVecs; ++j) {
        const size_t col = whichVectors_[j];
        KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
      }
    }

    // If there are multiple MPI processes, the all-reduce reads
    // from lclSums, and writes to meansOut. Otherwise, we just
    // copy lclSums into meansOut.
    if (! comm.is_null () && this->isDistributed ()) {
      reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
                 lclSums.data (), meansOut.data ());
    }
    else {
      Kokkos::deep_copy (meansOut, lclSums);
    }
  }
  else {
    // DualView was last modified on device, so run the local kernel there.
    auto X_lcl = subview (this->template getLocalView<device_type> (),
                          rowRng, Kokkos::ALL ());

    // Compute the local sum of each column.
    //
    // NOTE(review): lclSums lives in HostSpace while X_lcl is a device
    // view here; this presumably relies on UVM / host-accessible device
    // memory for KokkosBlas::sum's output — confirm on non-UVM builds.
    Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
    if (isConstantStride ()) {
      KokkosBlas::sum (lclSums, X_lcl);
    }
    else {
      for (size_t j = 0; j < numVecs; ++j) {
        const size_t col = whichVectors_[j];
        KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
      }
    }

    // If there are multiple MPI processes, the all-reduce reads
    // from lclSums, and writes to meansOut. (We assume that MPI
    // can read device memory.) Otherwise, we just copy lclSums
    // into meansOut.
    if (! comm.is_null () && this->isDistributed ()) {
      reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
                 lclSums.data (), meansOut.data ());
    }
    else {
      Kokkos::deep_copy (meansOut, lclSums);
    }
  }

  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
  // to the magnitude type, since operator/ (std::complex<T>, int)
  // isn't necessarily defined.
  const impl_scalar_type OneOverN =
    ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
  // Turn the global sums into means by scaling with 1/N.
  for (size_t k = 0; k < numMeans; ++k) {
    meansOut(k) = meansOut(k) * OneOverN;
  }
}
2526 
2527 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
{
  // Fill *this with pseudorandom values over the generator's full range:
  // [-max, max] for signed Scalar types, [0, max] for unsigned, where
  // max is the largest value Kokkos' XorShift64 generator produces for
  // impl_scalar_type.  Delegates to randomize(minVal, maxVal) below.
  //
  // NOTE(review): the declarator line (presumably "randomize ()") is
  // not visible in this excerpt of the file.
  typedef impl_scalar_type IST;
  typedef Kokkos::Details::ArithTraits<IST> ATS;
  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
  typedef typename pool_type::generator_type generator_type;

  // Largest value the pool's generator can produce for IST.
  const IST max = Kokkos::rand<generator_type, IST>::max ();
  // Symmetric range for signed types; nonnegative range otherwise.
  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();

  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
}
2543 
2544 
2545  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2546  void
2548  randomize (const Scalar& minVal, const Scalar& maxVal)
2549  {
2550  typedef impl_scalar_type IST;
2551  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2552 
2553  // Seed the pseudorandom number generator using the calling
2554  // process' rank. This helps decorrelate different process'
2555  // pseudorandom streams. It's not perfect but it's effective and
2556  // doesn't require MPI communication. The seed also includes bits
2557  // from the standard library's rand().
2558  //
2559  // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2560  // The code below just makes a new seed each time.
2561 
2562  const uint64_t myRank =
2563  static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2564  uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
2565  unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2566 
2567  pool_type rand_pool (seed);
2568  const IST max = static_cast<IST> (maxVal);
2569  const IST min = static_cast<IST> (minVal);
2570 
2571  // See #1510. In case diag has already been marked modified on
2572  // host or device, we need to clear those flags, since the code
2573  // below works on device.
2574  this->view_.modified_device () = 0;
2575  this->view_.modified_host () = 0;
2576 
2577  this->template modify<device_type> ();
2578  auto thisView = this->getLocalView<device_type> ();
2579 
2580  if (isConstantStride ()) {
2581  Kokkos::fill_random (thisView, rand_pool, min, max);
2582  }
2583  else {
2584  const size_t numVecs = getNumVectors ();
2585  for (size_t k = 0; k < numVecs; ++k) {
2586  const size_t col = whichVectors_[k];
2587  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2588  Kokkos::fill_random (X_k, rand_pool, min, max);
2589  }
2590  }
2591  }
2592 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
putScalar (const Scalar& alpha)
{
  // Set every entry of *this to alpha.
  //
  // The fill runs on whichever side (host or device) holds the most
  // recently modified copy of the data, so this method never forces a
  // host/device synchronization.
  using ::Tpetra::Details::ProfilingRegion;
  using ::Tpetra::Details::Blas::fill;
  typedef typename dual_view_type::t_dev::memory_space DMS;
  //typedef typename dual_view_type::t_host::memory_space HMS;
  typedef Kokkos::HostSpace HMS; // avoid CudaUVMSpace issues
  typedef typename dual_view_type::t_dev::execution_space DES;
  typedef typename dual_view_type::t_host::execution_space HES;
  typedef LocalOrdinal LO;
  ProfilingRegion region ("Tpetra::MultiVector::putScalar");

  // We need this cast for cases like Scalar = std::complex<T> but
  // impl_scalar_type = Kokkos::complex<T>.
  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
  const LO numVecs = static_cast<LO> (this->getNumVectors ());

  // Modify the most recently updated version of the data. This
  // avoids sync'ing, which could violate users' expectations.
  //
  // If we need sync to device, then host has the most recent version.
  const bool useHostVersion = this->template need_sync<DMS> ();

  if (! useHostVersion) { // last modified in device memory
    this->template modify<DMS> (); // we are about to modify on the device
    auto X = this->template getLocalView<DMS> ();
    if (this->isConstantStride ()) {
      fill (DES (), X, theAlpha, lclNumRows, numVecs);
    }
    else {
      // Noncontiguous columns: pass the column index list so fill only
      // touches the columns this MultiVector actually views.
      fill (DES (), X, theAlpha, lclNumRows, numVecs,
            this->whichVectors_.getRawPtr ());
    }
  }
  else { // last modified in host memory, so modify data there.
    this->template modify<HMS> (); // we are about to modify on the host
    auto X = this->template getLocalView<HMS> ();
    if (this->isConstantStride ()) {
      fill (HES (), X, theAlpha, lclNumRows, numVecs);
    }
    else {
      fill (HES (), X, theAlpha, lclNumRows, numVecs,
            this->whichVectors_.getRawPtr ());
    }
  }
}
2643 
2644 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
replaceMap (const Teuchos::RCP<const map_type>& newMap)
{
  // Replace this MultiVector's Map with newMap, reallocating the local
  // DualView only when the calling process enters or leaves the
  // communicator (Cases 2 and 3 below).  Not a data-redistribution
  // method; see the long comment below and use Import/Export for that.
  //
  // Throws std::logic_error if *this is a noncontiguous column view,
  // and std::invalid_argument if both current and new Maps are null.
  using Teuchos::ArrayRCP;
  using Teuchos::Comm;
  using Teuchos::RCP;

  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
  // it might work if the MV is a column view of another MV.
  // However, things might go wrong when restoring the original
  // Map, so we don't allow this case for now.
  TEUCHOS_TEST_FOR_EXCEPTION(
    ! this->isConstantStride (), std::logic_error,
    "Tpetra::MultiVector::replaceMap: This method does not currently work "
    "if the MultiVector is a column view of another MultiVector (that is, if "
    "isConstantStride() == false).");

  // Case 1: current Map and new Map are both nonnull on this process.
  // Case 2: current Map is nonnull, new Map is null.
  // Case 3: current Map is null, new Map is nonnull.
  // Case 4: both Maps are null: forbidden.
  //
  // Case 1 means that we don't have to do anything on this process,
  // other than assign the new Map. (We always have to do that.)
  // It's an error for the user to supply a Map that requires
  // resizing in this case.
  //
  // Case 2 means that the calling process is in the current Map's
  // communicator, but will be excluded from the new Map's
  // communicator. We don't have to do anything on the calling
  // process; just leave whatever data it may have alone.
  //
  // Case 3 means that the calling process is excluded from the
  // current Map's communicator, but will be included in the new
  // Map's communicator. This means we need to (re)allocate the
  // local DualView if it does not have the right number of rows.
  // If the new number of rows is nonzero, we'll fill the newly
  // allocated local data with zeros, as befits a projection
  // operation.
  //
  // The typical use case for Case 3 is that the MultiVector was
  // first created with the Map with more processes, then that Map
  // was replaced with a Map with fewer processes, and finally the
  // original Map was restored on this call to replaceMap.

#ifdef HAVE_TEUCHOS_DEBUG
  // mfh 28 Mar 2013: We can't check for compatibility across the
  // whole communicator, unless we know that the current and new
  // Maps are nonnull on _all_ participating processes.
  // TEUCHOS_TEST_FOR_EXCEPTION(
  //   origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
  //   std::invalid_argument, "Tpetra::MultiVector::project: "
  //   "If the input Map's communicator is compatible (has the same number of "
  //   "processes as) the current Map's communicator, then the two Maps must be "
  //   "compatible. The replaceMap() method is not for data redistribution; "
  //   "use Import or Export for that purpose.");

  // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
  // of the Map, in case the process counts don't match.
#endif // HAVE_TEUCHOS_DEBUG

  if (this->getMap ().is_null ()) { // current Map is null
    // If this->getMap() is null, that means that this MultiVector
    // has already had replaceMap happen to it. In that case, just
    // reallocate the DualView with the right size.

    TEUCHOS_TEST_FOR_EXCEPTION(
      newMap.is_null (), std::invalid_argument,
      "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
      "This probably means that the input Map is incorrect.");

    // Case 3: current Map is null, new Map is nonnull.
    // Reallocate the DualView with the right dimensions.
    const size_t newNumRows = newMap->getNodeNumElements ();
    const size_t origNumRows = view_.extent (0);
    const size_t numCols = this->getNumVectors ();

    // Only reallocate if the dimensions actually changed.
    if (origNumRows != newNumRows || view_.extent (1) != numCols) {
      view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (newNumRows, numCols);
    }
  }
  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
    // I am an excluded process. Reinitialize my data so that I
    // have 0 rows. Keep the number of columns as before.
    const size_t newNumRows = static_cast<size_t> (0);
    const size_t numCols = this->getNumVectors ();
    view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (newNumRows, numCols);
  }

  this->map_ = newMap;
}
2738 
2739  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2740  void
2742  scale (const Scalar& alpha)
2743  {
2744  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2745  if (theAlpha == Kokkos::Details::ArithTraits<impl_scalar_type>::one ()) {
2746  return; // do nothing
2747  }
2748  const size_t lclNumRows = this->getLocalLength ();
2749  const size_t numVecs = this->getNumVectors ();
2750  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2751  const std::pair<size_t, size_t> colRng (0, numVecs);
2752 
2753  // We can't substitute putScalar(0.0) for scale(0.0), because the
2754  // former will overwrite NaNs present in the MultiVector. The
2755  // semantics of this call require multiplying them by 0, which
2756  // IEEE 754 requires to be NaN.
2757 
2758  // If we need sync to device, then host has the most recent version.
2759  const bool useHostVersion = this->template need_sync<device_type> ();
2760  if (useHostVersion) {
2761  auto Y_lcl =
2762  Kokkos::subview (this->template getLocalView<Kokkos::HostSpace> (),
2763  rowRng, Kokkos::ALL ());
2764  if (isConstantStride ()) {
2765  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2766  }
2767  else {
2768  for (size_t k = 0; k < numVecs; ++k) {
2769  const size_t Y_col = this->isConstantStride () ? k :
2770  this->whichVectors_[k];
2771  auto Y_k = Kokkos::subview (Y_lcl, Kokkos::ALL (), Y_col);
2772  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2773  }
2774  }
2775  }
2776  else { // work on device
2777  auto Y_lcl =
2778  Kokkos::subview (this->template getLocalView<device_type> (),
2779  rowRng, Kokkos::ALL ());
2780  if (isConstantStride ()) {
2781  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2782  }
2783  else {
2784  for (size_t k = 0; k < numVecs; ++k) {
2785  const size_t Y_col = this->isConstantStride () ? k :
2786  this->whichVectors_[k];
2787  auto Y_k = Kokkos::subview (Y_lcl, Kokkos::ALL (), Y_col);
2788  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2789  }
2790  }
2791  }
2792  }
2793 
2794 
2795  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2796  void
2798  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2799  {
2800  const size_t numVecs = this->getNumVectors ();
2801  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2802  TEUCHOS_TEST_FOR_EXCEPTION(
2803  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2804  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2805  << numVecs << ".");
2806 
2807  // Use a DualView to copy the scaling constants onto the device.
2808  typedef Kokkos::DualView<impl_scalar_type*, device_type> k_alphas_type ;
2809  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2810  k_alphas.template modify<typename k_alphas_type::host_mirror_space> ();
2811  for (size_t i = 0; i < numAlphas; ++i) {
2812  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2813  }
2814  k_alphas.template sync<typename k_alphas_type::memory_space> ();
2815  // Invoke the scale() overload that takes a device View of coefficients.
2816  this->scale (k_alphas.d_view);
2817  }
2818 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
{
  // Scale column j of *this by alphas(j), where alphas is a device
  // View of coefficients.  Works on host or device, whichever holds
  // the most recently modified data; the host path first copies the
  // coefficients to a host mirror.
  //
  // Throws std::invalid_argument if alphas.extent(0) != getNumVectors().
  using Kokkos::ALL;
  using Kokkos::subview;

  const size_t lclNumRows = this->getLocalLength ();
  const size_t numVecs = this->getNumVectors ();
  TEUCHOS_TEST_FOR_EXCEPTION(
    static_cast<size_t> (alphas.extent (0)) != numVecs,
    std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
    "alphas.extent(0) = " << alphas.extent (0)
    << " != this->getNumVectors () = " << numVecs << ".");
  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
  const std::pair<size_t, size_t> colRng (0, numVecs);

  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
  // type of the return value of subview. This is because if we
  // switch the array layout from LayoutLeft to LayoutRight
  // (preferred for performance of block operations), the types
  // below won't be valid. (A view of a column of a LayoutRight
  // multivector has LayoutStride, not LayoutLeft.)

  // If we need sync to device, then host has the most recent version.
  const bool useHostVersion = this->template need_sync<device_type> ();
  if (useHostVersion) {
    // Work in host memory. This means we need to create a host
    // mirror of the input View of coefficients.
    auto alphas_h = Kokkos::create_mirror_view (alphas);
    typedef typename decltype (alphas_h)::memory_space cur_memory_space;
    Kokkos::deep_copy (alphas_h, alphas);

    auto Y_lcl =
      Kokkos::subview (this->template getLocalView<cur_memory_space> (),
                       rowRng, Kokkos::ALL ());
    if (isConstantStride ()) {
      KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
    }
    else {
      // Noncontiguous columns: scale each selected column separately.
      for (size_t k = 0; k < numVecs; ++k) {
        const size_t Y_col = this->isConstantStride () ? k :
          this->whichVectors_[k];
        auto Y_k = subview (Y_lcl, ALL (), Y_col);
        // We don't have to use the entire 1-D View here; we can use
        // the version that takes a scalar coefficient.
        KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
      }
    }
  }
  else { // Work in device memory, using the input View 'alphas' directly.
    auto Y_lcl =
      Kokkos::subview (this->template getLocalView<device_type> (),
                       rowRng, Kokkos::ALL());
    if (isConstantStride ()) {
      KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
    }
    else {
      for (size_t k = 0; k < numVecs; ++k) {
        const size_t Y_col = this->isConstantStride () ? k :
          this->whichVectors_[k];
        auto Y_k = subview (Y_lcl, ALL (), Y_col);
        //
        // FIXME (mfh 08 Apr 2015) This assumes UVM. It would be
        // better to fix scal() so that it takes a 0-D View as the
        // second argument.
        //
        KokkosBlas::scal (Y_k, alphas(k), Y_k);
      }
    }
  }
}
2892 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
scale (const Scalar& alpha,
{
  // Scaled assignment: this(i,j) := alpha * A(i,j).
  //
  // NOTE(review): the second parameter line (a MultiVector argument A,
  // per the body below) is not visible in this excerpt of the file.
  //
  // Throws std::invalid_argument if A's local length or number of
  // vectors differs from that of *this.
  using Kokkos::ALL;
  using Kokkos::subview;
  typedef typename dual_view_type::t_dev::memory_space dev_memory_space;

  const char tfecfFuncName[] = "scale: ";

  const size_t lclNumRows = getLocalLength ();
  const size_t numVecs = getNumVectors ();

  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    lclNumRows != A.getLocalLength (), std::invalid_argument,
    "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
    << A.getLocalLength () << ".");
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    numVecs != A.getNumVectors (), std::invalid_argument,
    "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
    << A.getNumVectors () << ".");

  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
  const std::pair<size_t, size_t> colRng (0, numVecs);

  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
  if (this->template need_sync<dev_memory_space> ()) this->template sync<dev_memory_space> ();
  // A is const to callers, but syncing its DualView requires a
  // non-const reference; hence the const_cast.
  if (A.template need_sync<dev_memory_space> ()) const_cast<MV&>(A).template sync<dev_memory_space> ();

  this->template modify<dev_memory_space> ();
  auto Y_lcl_orig = this->template getLocalView<dev_memory_space> ();
  auto X_lcl_orig = A.template getLocalView<dev_memory_space> ();
  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());

  if (isConstantStride () && A.isConstantStride ()) {
    KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
  }
  else {
    // Make sure that Kokkos only uses the local length for add.
    for (size_t k = 0; k < numVecs; ++k) {
      const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
      const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
      auto Y_k = subview (Y_lcl, ALL (), Y_col);
      auto X_k = subview (X_lcl, ALL (), X_col);

      KokkosBlas::scal (Y_k, theAlpha, X_k);
    }
  }
}
2947 
2948 
2949 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
{
  // Element-wise reciprocal: this(i,j) := 1 / A(i,j), computed with
  // KokkosBlas::reciprocal on the device.
  //
  // NOTE(review): the declarator line (presumably
  // "reciprocal (const MultiVector& A)") and the MV typedef line are
  // not visible in this excerpt of the file; the body below reads an
  // argument named A.
  //
  // Throws std::runtime_error if A's dimensions differ from *this.
  typedef typename MV::dual_view_type::t_dev::memory_space dev_memory_space;

  const char tfecfFuncName[] = "reciprocal: ";

  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    getLocalLength () != A.getLocalLength (), std::runtime_error,
    "MultiVectors do not have the same local length. "
    "this->getLocalLength() = " << getLocalLength ()
    << " != A.getLocalLength() = " << A.getLocalLength () << ".");
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    A.getNumVectors () != this->getNumVectors (), std::runtime_error,
    ": MultiVectors do not have the same number of columns (vectors). "
    "this->getNumVectors() = " << getNumVectors ()
    << " != A.getNumVectors() = " << A.getNumVectors () << ".");

  const size_t numVecs = getNumVectors ();

  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
  if (this->template need_sync<dev_memory_space> ()) this->template sync<dev_memory_space> ();
  // const_cast: syncing A's DualView needs a non-const reference.
  if (A.template need_sync<dev_memory_space> ()) const_cast<MV&>(A).template sync<dev_memory_space> ();
  this->template modify<dev_memory_space> ();

  auto this_view_dev = this->template getLocalView<dev_memory_space> ();
  auto A_view_dev = A.template getLocalView<dev_memory_space> ();

  if (isConstantStride () && A.isConstantStride ()) {
    KokkosBlas::reciprocal (this_view_dev, A_view_dev);
  }
  else {
    // Noncontiguous columns: apply the kernel column by column.
    using Kokkos::ALL;
    using Kokkos::subview;
    for (size_t k = 0; k < numVecs; ++k) {
      const size_t this_col = isConstantStride () ? k : whichVectors_[k];
      auto vector_k = subview (this_view_dev, ALL (), this_col);
      const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
      auto vector_Ak = subview (A_view_dev, ALL (), A_col);
      KokkosBlas::reciprocal (vector_k, vector_Ak);
    }
  }
}
2996 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
{
  // Element-wise absolute value: this(i,j) := |A(i,j)|, computed with
  // KokkosBlas::abs on the device.
  //
  // NOTE(review): the declarator line (presumably
  // "abs (const MultiVector& A)") and the MV typedef line are not
  // visible in this excerpt of the file; the body below reads an
  // argument named A.
  //
  // Throws std::runtime_error if A's dimensions differ from *this.
  typedef typename MV::dual_view_type::t_dev::memory_space dev_memory_space;

  const char tfecfFuncName[] = "abs";
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    getLocalLength () != A.getLocalLength (), std::runtime_error,
    ": MultiVectors do not have the same local length. "
    "this->getLocalLength() = " << getLocalLength ()
    << " != A.getLocalLength() = " << A.getLocalLength () << ".");
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    A.getNumVectors () != this->getNumVectors (), std::runtime_error,
    ": MultiVectors do not have the same number of columns (vectors). "
    "this->getNumVectors() = " << getNumVectors ()
    << " != A.getNumVectors() = " << A.getNumVectors () << ".");
  const size_t numVecs = getNumVectors ();

  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
  if (this->template need_sync<dev_memory_space> ()) this->template sync<dev_memory_space> ();
  // const_cast: syncing A's DualView needs a non-const reference.
  if (A.template need_sync<dev_memory_space> ()) const_cast<MV&>(A).template sync<dev_memory_space> ();
  this->template modify<dev_memory_space> ();

  auto this_view_dev = this->template getLocalView<dev_memory_space> ();
  auto A_view_dev = A.template getLocalView<dev_memory_space> ();

  if (isConstantStride () && A.isConstantStride ()) {
    KokkosBlas::abs (this_view_dev, A_view_dev);
  }
  else {
    // Noncontiguous columns: apply the kernel column by column.
    using Kokkos::ALL;
    using Kokkos::subview;

    for (size_t k=0; k < numVecs; ++k) {
      const size_t this_col = isConstantStride () ? k : whichVectors_[k];
      auto vector_k = subview (this_view_dev, ALL (), this_col);
      const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
      auto vector_Ak = subview (A_view_dev, ALL (), A_col);
      KokkosBlas::abs (vector_k, vector_Ak);
    }
  }
}
3042 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
update (const Scalar& alpha,
        const Scalar& beta)
{
  // AXPBY update: this := beta * this + alpha * A, computed on the
  // device via KokkosBlas::axpby.
  //
  // NOTE(review): the parameter line declaring the MultiVector A
  // (between alpha and beta) is not visible in this excerpt of the
  // file; the body below reads an argument named A.
  //
  // Throws std::invalid_argument if A's dimensions differ from *this.
  const char tfecfFuncName[] = "update: ";
  using Kokkos::subview;
  using Kokkos::ALL;
  typedef typename dual_view_type::t_dev::memory_space dev_memory_space;

  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");

  const size_t lclNumRows = getLocalLength ();
  const size_t numVecs = getNumVectors ();

  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    lclNumRows != A.getLocalLength (), std::invalid_argument,
    "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
    << A.getLocalLength () << ".");
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    numVecs != A.getNumVectors (), std::invalid_argument,
    "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
    << A.getNumVectors () << ".");

  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
  if (this->template need_sync<dev_memory_space> ()) this->template sync<dev_memory_space> ();
  // const_cast: syncing A's DualView needs a non-const reference.
  if (A.template need_sync<dev_memory_space> ()) const_cast<MV&>(A).template sync<dev_memory_space> ();

  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
  const std::pair<size_t, size_t> colRng (0, numVecs);

  auto Y_lcl_orig = this->template getLocalView<dev_memory_space> ();
  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
  auto X_lcl_orig = A.template getLocalView<dev_memory_space> ();
  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());

  // The device memory of *this is about to be modified
  this->template modify<dev_memory_space> ();
  if (isConstantStride () && A.isConstantStride ()) {
    KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
  }
  else {
    // Make sure that Kokkos only uses the local length for add.
    for (size_t k = 0; k < numVecs; ++k) {
      const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
      const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
      auto Y_k = subview (Y_lcl, ALL (), Y_col);
      auto X_k = subview (X_lcl, ALL (), X_col);

      KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
    }
  }
}
3101 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
update (const Scalar& alpha,
        const Scalar& beta,
        const Scalar& gamma)
{
  // Three-term update: this := gamma * this + alpha * A + beta * B,
  // computed on the device via KokkosBlas::update.
  //
  // NOTE(review): the parameter lines declaring the MultiVectors A and
  // B are not visible in this excerpt of the file; the body below
  // reads arguments named A and B.
  //
  // Throws std::invalid_argument if A's or B's dimensions differ from
  // those of *this.
  using Kokkos::ALL;
  using Kokkos::subview;
  typedef typename dual_view_type::t_dev::memory_space dev_memory_space;

  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";

  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");

  const size_t lclNumRows = this->getLocalLength ();
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    lclNumRows != A.getLocalLength (), std::invalid_argument,
    "The input MultiVector A has " << A.getLocalLength () << " local "
    "row(s), but this MultiVector has " << lclNumRows << " local "
    "row(s).");
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    lclNumRows != B.getLocalLength (), std::invalid_argument,
    "The input MultiVector B has " << B.getLocalLength () << " local "
    "row(s), but this MultiVector has " << lclNumRows << " local "
    "row(s).");
  const size_t numVecs = getNumVectors ();
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    A.getNumVectors () != numVecs, std::invalid_argument,
    "The input MultiVector A has " << A.getNumVectors () << " column(s), "
    "but this MultiVector has " << numVecs << " column(s).");
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
    B.getNumVectors () != numVecs, std::invalid_argument,
    "The input MultiVector B has " << B.getNumVectors () << " column(s), "
    "but this MultiVector has " << numVecs << " column(s).");

  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);

  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
  if (this->template need_sync<dev_memory_space> ()) this->template sync<dev_memory_space> ();
  // const_cast: syncing A's and B's DualViews needs non-const references.
  if (A.template need_sync<dev_memory_space> ()) const_cast<MV&>(A).template sync<dev_memory_space> ();
  if (B.template need_sync<dev_memory_space> ()) const_cast<MV&>(B).template sync<dev_memory_space> ();

  // This method modifies *this.
  this->template modify<dev_memory_space> ();

  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
  const std::pair<size_t, size_t> colRng (0, numVecs);

  // Prefer 'auto' over specifying the type explicitly. This avoids
  // issues with a subview possibly having a different type than the
  // original view.
  auto C_lcl = subview (this->template getLocalView<dev_memory_space> (), rowRng, ALL ());
  auto A_lcl = subview (A.template getLocalView<dev_memory_space> (), rowRng, ALL ());
  auto B_lcl = subview (B.template getLocalView<dev_memory_space> (), rowRng, ALL ());

  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
    KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
  }
  else {
    // Some input (or *this) is not constant stride,
    // so perform the update one column at a time.
    for (size_t k = 0; k < numVecs; ++k) {
      const size_t this_col = isConstantStride () ? k : whichVectors_[k];
      const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
      const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
      KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
                          theBeta, subview (B_lcl, rowRng, B_col),
                          theGamma, subview (C_lcl, rowRng, this_col));
    }
  }
}
3179 
3180  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3181  Teuchos::ArrayRCP<const Scalar>
// Return a persisting, read-only host view of column j's data, as an
// ArrayRCP<const Scalar> of length getLocalLength().  Implies a
// device->host sync (see below); does NOT mark host data as modified,
// since the returned view is const.
// NOTE(review): j is not range-checked here; whichVectors_[j] assumes
// j < getNumVectors() -- confirm callers guarantee this.
3183  getData (size_t j) const
3184  {
3185  using Kokkos::ALL;
3186  using Kokkos::subview;
3188  const char tfecfFuncName[] = "getData: ";
3189 
3190  // Any MultiVector method that called the (classic) Kokkos Node's
3191  // viewBuffer or viewBufferNonConst methods always implied a
3192  // device->host synchronization. Thus, we synchronize here as
3193  // well.
// const_cast is needed because sync() is nonconst, but this method is
// (and historically was) marked const.
3194  const_cast<MV*> (this)->template sync<Kokkos::HostSpace> ();
3195 
3196  // Get a host view of the entire MultiVector's data.
3197  auto hostView = this->template getLocalView<Kokkos::HostSpace> ();
3198 
3199  // Get a subview of column j.
3200  const size_t col = isConstantStride () ? j : whichVectors_[j];
3201  auto hostView_j = subview (hostView, ALL (), col);
3202 
3203  // Wrap up the subview of column j in an ArrayRCP<const impl_scalar_type>.
3204  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3205  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3206 
// Internal consistency check: the view must be at least as long as the
// ArrayRCP that aliases it.
3207  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3208  (static_cast<size_t> (hostView_j.extent (0)) <
3209  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3210  "hostView_j.extent(0) = " << hostView_j.extent (0)
3211  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3212  "Please report this bug to the Tpetra developers.");
3213 
// impl_scalar_type may differ from Scalar (e.g., Kokkos::complex vs.
// std::complex), so reinterpret the ArrayRCP to the user's Scalar type.
3214  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3215  }
3216 
3217  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3218  Teuchos::ArrayRCP<Scalar>
// getDataNonConst(j): like getData(j), but returns a writable view of
// column j and therefore marks the host copy as modified.
3221  {
3222  using Kokkos::ALL;
3223  using Kokkos::subview;
3225  const char tfecfFuncName[] = "getDataNonConst: ";
3226 
3227  // Any MultiVector method that called the (classic) Kokkos Node's
3228  // viewBuffer or viewBufferNonConst methods always implied a
3229  // device->host synchronization. Thus, we synchronize here as
3230  // well.
3231  const_cast<MV*> (this)->template sync<Kokkos::HostSpace> ();
3232 
3233  // Calling getDataNonConst() implies that the user plans to modify
3234  // the values in the MultiVector, so we mark the host data as modified.
3235  this->template modify<Kokkos::HostSpace> ();
3236 
3237  // Get a host view of the entire MultiVector's data.
3238  auto hostView = this->template getLocalView<Kokkos::HostSpace> ();
3239 
3240  // Get a subview of column j.
// NOTE(review): as in getData, j is not range-checked here.
3241  const size_t col = isConstantStride () ? j : whichVectors_[j];
3242  auto hostView_j = subview (hostView, ALL (), col);
3243 
3244  // Wrap up the subview of column j in an ArrayRCP<const impl_scalar_type>.
3245  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3246  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3247 
// Internal consistency check, mirroring getData().
3248  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3249  (static_cast<size_t> (hostView_j.extent (0)) <
3250  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3251  "hostView_j.extent(0) = " << hostView_j.extent (0)
3252  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3253  "Please report this bug to the Tpetra developers.");
3254 
3255  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3256  }
3257 
3258  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// Copy-assignment operator.  Implements VIEW (shallow-copy) semantics:
// after assignment, *this and source share the same underlying
// Kokkos::DualView data; only metadata (Map via base class, views,
// column-selection list) is copied.
3262  {
// Self-assignment guard: assigning an object to itself is a no-op.
3263  if (this != &source) {
3264  base_type::operator= (source);
3265  //
3266  // operator= implements view semantics (shallow copy).
3267  //
3268 
3269  // Kokkos::View operator= also implements view semantics.
3270  view_ = source.view_;
3271  origView_ = source.origView_;
3272 
3273  // NOTE (mfh 24 Mar 2014) Christian wrote here that assigning
3274  // whichVectors_ is "probably not ok" (probably constitutes deep
3275  // copy). I would say that it's OK, because whichVectors_ is
3276  // immutable (from the user's perspective); it's analogous to
3277  // the dimensions or stride. Once we make whichVectors_ a
3278  // Kokkos::View instead of a Teuchos::Array, all debate will go
3279  // away and we will unquestionably have view semantics.
3280  whichVectors_ = source.whichVectors_;
3281  }
3282  return *this;
3283  }
3284 
3285 
3286  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3287  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Deep-copy the columns listed in 'cols' into a new MultiVector.
// If the requested column indices form a contiguous ascending run,
// delegate to the (more efficient) Range1D overload; otherwise take a
// subview of the requested columns and deep-copy it via assign().
3289  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3290  {
3291  using Teuchos::RCP;
3293 
3294  // Check whether the index set in cols is contiguous. If it is,
3295  // use the more efficient Range1D version of subCopy.
3296  bool contiguous = true;
3297  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
// An empty or single-entry list is trivially contiguous; the loop
// starts at j = 1 and compares each entry with its predecessor.
3298  for (size_t j = 1; j < numCopyVecs; ++j) {
3299  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3300  contiguous = false;
3301  break;
3302  }
3303  }
3304  if (contiguous && numCopyVecs > 0) {
3305  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3306  }
3307  else {
// Non-contiguous case: view the selected columns, then deep-copy the
// view into a freshly allocated (uninitialized: false => no fill)
// MultiVector with the same Map.
3308  RCP<const MV> X_sub = this->subView (cols);
3309  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3310  Y->assign (*X_sub);
3311  return Y;
3312  }
3313  }
3314 
3315  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3316  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Deep-copy the contiguous column range 'colRng' into a new
// MultiVector: take a subview of that range, allocate a new
// MultiVector of matching width, and deep-copy via assign().
3318  subCopy (const Teuchos::Range1D &colRng) const
3319  {
3320  using Teuchos::RCP;
3322 
3323  RCP<const MV> X_sub = this->subView (colRng);
3324  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3325  Y->assign (*X_sub);
3326  return Y;
3327  }
3328 
3329  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3330  size_t
// Number of rows in the ORIGINAL view (origView_), i.e., before any
// offset-view row restriction was applied to produce view_.
3333  return origView_.extent (0);
3334  }
3335 
3336  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3337  size_t
// Number of columns in the ORIGINAL view (origView_), i.e., before any
// column-subview restriction was applied to produce view_.
3340  return origView_.extent (1);
3341  }
3342 
3343  template <class Scalar, class LO, class GO, class Node>
// Offset-view constructor: build a MultiVector that views rows
// [offset, offset + subMap.getNodeNumElements()) of X, with subMap as
// its Map.  Shares X's data (shallow view); does not copy.
3346  const map_type& subMap,
3347  const size_t offset) :
3348  base_type (Teuchos::null) // to be replaced below
3349  {
3350  using Kokkos::ALL;
3351  using Kokkos::subview;
3352  using Teuchos::RCP;
3353  using Teuchos::rcp;
3355  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3356 
3357  const size_t newNumRows = subMap.getNodeNumElements ();
// NOTE(review): the guard below compares against getOrigNumLocalRows(),
// but the exception condition inside compares against getLocalLength(),
// and the message reports getOrigNumLocalRows().  These can differ for
// offset views -- confirm which bound is intended.
3358  const bool tooManyElts = newNumRows + offset > X.getOrigNumLocalRows ();
3359  if (tooManyElts) {
3360  const int myRank = X.getMap ()->getComm ()->getRank ();
3361  TEUCHOS_TEST_FOR_EXCEPTION(
3362  newNumRows + offset > X.getLocalLength (), std::runtime_error,
3363  prefix << "Invalid input Map. The input Map owns " << newNumRows <<
3364  " entries on process " << myRank << ". offset = " << offset << ". "
3365  "Yet, the MultiVector contains only " << X.getOrigNumLocalRows () <<
3366  " rows on this process.");
3367  }
3368 
3369  // mfh 27 Sep 2017: This debugging code depends on being able to
3370  // declare variables that we can use below, so it's too hard for
3371  // now to use run-time control to enable this code.
3372 #ifdef HAVE_TPETRA_DEBUG
// Capture X's dimensions/stride/host pointer BEFORE taking the view,
// so we can verify below that taking the view did not mutate X.
3373  const size_t strideBefore =
3374  X.isConstantStride () ? X.getStride () : static_cast<size_t> (0);
3375  const size_t lclNumRowsBefore = X.getLocalLength ();
3376  const size_t numColsBefore = X.getNumVectors ();
3377  const impl_scalar_type* hostPtrBefore =
3378  X.template getLocalView<Kokkos::HostSpace> ().data ();
3379 #endif // HAVE_TPETRA_DEBUG
3380 
// Row range for origView_ keeps everything from 'offset' to the end;
// row range for view_ keeps exactly the newNumRows rows of the subMap.
3381  const std::pair<size_t, size_t> origRowRng (offset, X.origView_.extent (0));
3382  const std::pair<size_t, size_t> rowRng (offset, offset + newNumRows);
3383 
3384  dual_view_type newOrigView = subview (X.origView_, origRowRng, ALL ());
3385  // FIXME (mfh 29 Sep 2016) If we just use X.view_ here, it breaks
3386  // CrsMatrix's Gauss-Seidel implementation (which assumes the
3387  // ability to create domain Map views of column Map MultiVectors,
3388  // and then get the original column Map MultiVector out again).
3389  // If we just use X.origView_ here, it breaks the fix for #46.
3390  // The test for offset == 0 is a hack that makes both tests pass,
3391  // but doesn't actually fix the more general issue. In
3392  // particular, the right way to fix Gauss-Seidel would be to fix
3393  // #385; that would make "getting the original column Map
3394  // MultiVector out again" unnecessary.
3395  dual_view_type newView = subview (offset == 0 ? X.origView_ : X.view_, rowRng, ALL ());
3396 
3397  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3398  // handling subviews of degenerate Views quite so well. For some
3399  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3400  // 0. We work around by creating a new empty DualView of the
3401  // desired (degenerate) dimensions.
3402  if (newOrigView.extent (0) == 0 &&
3403  newOrigView.extent (1) != X.origView_.extent (1)) {
3404  newOrigView = allocDualView<Scalar, LO, GO, Node> (size_t (0),
3405  X.getNumVectors ());
3406  }
3407  if (newView.extent (0) == 0 &&
3408  newView.extent (1) != X.view_.extent (1)) {
3409  newView = allocDualView<Scalar, LO, GO, Node> (size_t (0),
3410  X.getNumVectors ());
3411  }
3412 
// Build the view MultiVector, propagating X's noncontiguous column
// selection (whichVectors_) if X is not constant stride.
3413  MV subViewMV = X.isConstantStride () ?
3414  MV (Teuchos::rcp (new map_type (subMap)), newView, newOrigView) :
3415  MV (Teuchos::rcp (new map_type (subMap)), newView, newOrigView, X.whichVectors_ ());
3416 
3417 #ifdef HAVE_TPETRA_DEBUG
// Re-capture X's state and the result's state, then assert that
// (a) X is unchanged and (b) the result is consistent with subMap.
3418  const size_t strideAfter = X.isConstantStride () ?
3419  X.getStride () :
3420  static_cast<size_t> (0);
3421  const size_t lclNumRowsAfter = X.getLocalLength ();
3422  const size_t numColsAfter = X.getNumVectors ();
3423  const impl_scalar_type* hostPtrAfter =
3424  X.template getLocalView<Kokkos::HostSpace> ().data ();
3425 
3426  const size_t strideRet = subViewMV.isConstantStride () ?
3427  subViewMV.getStride () :
3428  static_cast<size_t> (0);
3429  const size_t lclNumRowsRet = subViewMV.getLocalLength ();
3430  const size_t numColsRet = subViewMV.getNumVectors ();
3431 
3432  const char suffix[] = ". This should never happen. Please report this "
3433  "bug to the Tpetra developers.";
3434 
3435  TEUCHOS_TEST_FOR_EXCEPTION(
3436  lclNumRowsRet != subMap.getNodeNumElements (),
3437  std::logic_error, prefix << "Returned MultiVector has a number of rows "
3438  "different than the number of local indices in the input Map. "
3439  "lclNumRowsRet: " << lclNumRowsRet << ", subMap.getNodeNumElements(): "
3440  << subMap.getNodeNumElements () << suffix);
3441  TEUCHOS_TEST_FOR_EXCEPTION(
3442  strideBefore != strideAfter || lclNumRowsBefore != lclNumRowsAfter ||
3443  numColsBefore != numColsAfter || hostPtrBefore != hostPtrAfter,
3444  std::logic_error, prefix << "Original MultiVector changed dimensions, "
3445  "stride, or host pointer after taking offset view. strideBefore: " <<
3446  strideBefore << ", strideAfter: " << strideAfter << ", lclNumRowsBefore: "
3447  << lclNumRowsBefore << ", lclNumRowsAfter: " << lclNumRowsAfter <<
3448  ", numColsBefore: " << numColsBefore << ", numColsAfter: " <<
3449  numColsAfter << ", hostPtrBefore: " << hostPtrBefore << ", hostPtrAfter: "
3450  << hostPtrAfter << suffix);
3451  TEUCHOS_TEST_FOR_EXCEPTION(
3452  strideBefore != strideRet, std::logic_error, prefix << "Returned "
3453  "MultiVector has different stride than original MultiVector. "
3454  "strideBefore: " << strideBefore << ", strideRet: " << strideRet <<
3455  ", numColsBefore: " << numColsBefore << ", numColsRet: " << numColsRet
3456  << suffix);
3457  TEUCHOS_TEST_FOR_EXCEPTION(
3458  numColsBefore != numColsRet, std::logic_error,
3459  prefix << "Returned MultiVector has a different number of columns than "
3460  "original MultiVector. numColsBefore: " << numColsBefore << ", "
3461  "numColsRet: " << numColsRet << suffix);
3462 #endif // HAVE_TPETRA_DEBUG
3463 
// operator= has view semantics, so this transfers subViewMV's views
// and Map into *this without copying data.
3464  *this = subViewMV; // shallow copy
3465  }
3466 
3467  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3468  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Const version of offsetViewNonConst: returns a read-only shallow view
// of rows [offset, offset + subMap size) of *this, via the offset-view
// constructor.
3470  offsetView (const Teuchos::RCP<const map_type>& subMap,
3471  const size_t offset) const
3472  {
3474  return Teuchos::rcp (new MV (*this, *subMap, offset));
3475  }
3476 
3477  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3478  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Nonconst offset view: returns a writable shallow view of rows
// [offset, offset + subMap size) of *this, via the offset-view
// constructor.  Writes through the view affect *this.
3480  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3481  const size_t offset)
3482  {
3484  return Teuchos::rcp (new MV (*this, *subMap, offset));
3485  }
3486 
3487  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3488  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Return a shallow (view-semantics) MultiVector over the columns listed
// in 'cols'.  Contiguous column lists are forwarded to the Range1D
// overload; otherwise the result is a noncontiguous-column view.
3490  subView (const Teuchos::ArrayView<const size_t>& cols) const
3491  {
3492  using Teuchos::Array;
3493  using Teuchos::rcp;
3495 
3496  const size_t numViewCols = static_cast<size_t> (cols.size ());
// NOTE(review): this guard throws whenever cols is empty, which makes
// the numViewCols == 0 branch below unreachable; also the message says
// "== 0" although the test is "< 1".  Verify intended behavior for
// empty input.
3497  TEUCHOS_TEST_FOR_EXCEPTION(
3498  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3499  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3500  "contain at least one entry, but cols.size() = " << cols.size ()
3501  << " == 0.");
3502 
3503  // Check whether the index set in cols is contiguous. If it is,
3504  // use the more efficient Range1D version of subView.
3505  bool contiguous = true;
3506  for (size_t j = 1; j < numViewCols; ++j) {
3507  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3508  contiguous = false;
3509  break;
3510  }
3511  }
3512  if (contiguous) {
3513  if (numViewCols == 0) {
3514  // The output MV has no columns, so there is nothing to view.
3515  return rcp (new MV (this->getMap (), numViewCols));
3516  } else {
3517  // Use the more efficient contiguous-index-range version.
3518  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3519  }
3520  }
3521 
// Noncontiguous case: keep the same data views, but record the selected
// column indices.  If *this is itself a noncontiguous view, compose the
// selections (cols indexes into whichVectors_).
3522  if (isConstantStride ()) {
3523  return rcp (new MV (this->getMap (), view_, origView_, cols));
3524  }
3525  else {
3526  Array<size_t> newcols (cols.size ());
3527  for (size_t j = 0; j < numViewCols; ++j) {
3528  newcols[j] = whichVectors_[cols[j]];
3529  }
3530  return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
3531  }
3532  }
3533 
3534 
3535  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3536  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Return a shallow (view-semantics) MultiVector over the contiguous
// column range 'colRng' of *this.  In debug mode, performs collective
// sanity checks over the Map's communicator.
3538  subView (const Teuchos::Range1D& colRng) const
3539  {
3540  using ::Tpetra::Details::Behavior;
3541  using Kokkos::ALL;
3542  using Kokkos::subview;
3543  using Teuchos::Array;
3544  using Teuchos::RCP;
3545  using Teuchos::rcp;
3547  const char tfecfFuncName[] = "subView(Range1D): ";
3548 
3549  const size_t lclNumRows = this->getLocalLength ();
3550  const size_t numVecs = this->getNumVectors ();
3551  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3552  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3553  // "at least one vector.");
3554  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3555  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3556  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3557  << numVecs << ".");
3558  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3559  numVecs != 0 && colRng.size () != 0 &&
3560  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3561  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3562  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3563  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3564  "[0, " << numVecs << "].");
3565 
3566  RCP<const MV> X_ret; // the MultiVector subview to return
3567 
3568  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3569  // broken for the case of views with zero rows. I will brutally
3570  // enforce that the subview has the correct dimensions. In
3571  // particular, in the case of zero rows, I will, if necessary,
3572  // create a new dual_view_type with zero rows and the correct
3573  // number of columns. In a debug build, I will use an all-reduce
3574  // to ensure that it has the correct dimensions on all processes.
3575 
3576  const std::pair<size_t, size_t> rows (0, lclNumRows);
3577  if (colRng.size () == 0) {
3578  const std::pair<size_t, size_t> cols (0, 0); // empty range
3579  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3580  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3581  }
3582  else {
3583  // Returned MultiVector is constant stride only if *this is.
3584  if (isConstantStride ()) {
3585  const std::pair<size_t, size_t> cols (colRng.lbound (),
3586  colRng.ubound () + 1);
3587  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3588  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3589  }
3590  else {
3591  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3592  // We're only asking for one column, so the result does have
3593  // constant stride, even though this MultiVector does not.
// NOTE(review): this computes whichVectors_[0] + lbound rather than
// whichVectors_[lbound], which is only the same if the selected column
// indices are contiguous -- confirm this invariant holds here.
3594  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3595  whichVectors_[0] + colRng.ubound () + 1);
3596  dual_view_type X_sub = takeSubview (view_, ALL (), col);
3597  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3598  }
3599  else {
// Multi-column noncontiguous case: slice the whichVectors_ list itself.
3600  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3601  whichVectors_.begin () + colRng.ubound () + 1);
3602  X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3603  }
3604  }
3605  }
3606 
// Debug-mode collective checks: verify on every process that the
// subview was created and has the requested number of columns.
3607  const bool debug = Behavior::debug ();
3608  if (debug) {
3609  using Teuchos::Comm;
3610  using Teuchos::outArg;
3611  using Teuchos::REDUCE_MIN;
3612  using Teuchos::reduceAll;
3613 
3614  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3615  Teuchos::null : this->getMap ()->getComm ();
3616  if (! comm.is_null ()) {
3617  int lclSuccess = 1;
3618  int gblSuccess = 1;
3619 
3620  if (X_ret.is_null ()) {
3621  lclSuccess = 0;
3622  }
3623  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3624  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3625  (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3626  "MultiVector; the return value of this method) is null on some MPI "
3627  "process in this MultiVector's communicator. This should never "
3628  "happen. Please report this bug to the Tpetra developers.");
3629  if (! X_ret.is_null () &&
3630  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3631  lclSuccess = 0;
3632  }
3633  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3634  outArg (gblSuccess));
3635  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3636  (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3637  "colRng.size(), on at least one MPI process in this MultiVector's "
3638  "communicator. This should never happen. "
3639  "Please report this bug to the Tpetra developers.");
3640  }
3641  }
3642  return X_ret;
3643  }
3644 
3645 
3646  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3647  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Nonconst variant of subView(ArrayView): delegates to the const
// version, then casts away const on the returned RCP.  Safe because
// *this is nonconst here.
3649  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3650  {
3652  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3653  }
3654 
3655 
3656  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3657  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Nonconst variant of subView(Range1D): delegates to the const version,
// then casts away const on the returned RCP.  Safe because *this is
// nonconst here.
3659  subViewNonConst (const Teuchos::Range1D &colRng)
3660  {
3662  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3663  }
3664 
3665 
3666  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// Single-column view constructor: build a (single-column) view of
// column j of X, sharing X's data and Map.
3669  const size_t j)
3670  : base_type (X.getMap ())
3671  {
3672  using Kokkos::subview;
3673  typedef std::pair<size_t, size_t> range_type;
3674  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3675 
3676  const size_t numCols = X.getNumVectors ();
3677  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3678  (j >= numCols, std::invalid_argument, "Input index j (== " << j
3679  << ") exceeds valid column index range [0, " << numCols << " - 1].");
// Map the logical column index j to the physical column jj in view_,
// accounting for a possible noncontiguous column selection in X.
3680  const size_t jj = X.isConstantStride () ?
3681  static_cast<size_t> (j) :
3682  static_cast<size_t> (X.whichVectors_[j]);
3683  this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3684  this->origView_ = X.origView_;
3685 
3686  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3687  // Export or Import operations with different subviews of a
3688  // MultiVector. Thus, it is safe to reuse communication buffers.
3689  // See #1560 discussion.
3690  //
3691  // We only need one column's worth of buffer for imports_ and
3692  // exports_. Taking subviews now ensures that their lengths will
3693  // be exactly what we need, so we won't have to resize them later.
// NOTE(review): newImports and newExports below are local copies that
// go out of scope at the end of each block without ever being assigned
// to this->imports_ / this->exports_ -- as written, these two blocks
// have no effect.  Verify whether assignments were intended (or lost).
3694  {
3695  const size_t newSize = X.imports_.extent (0) / numCols;
3696  auto newImports = X.imports_;
3697  newImports.d_view = subview (X.imports_.d_view, range_type (0, newSize));
3698  newImports.h_view = subview (X.imports_.h_view, range_type (0, newSize));
3699  }
3700  {
3701  const size_t newSize = X.exports_.extent (0) / numCols;
3702  auto newExports = X.exports_;
3703  newExports.d_view = subview (X.exports_.d_view, range_type (0, newSize));
3704  newExports.h_view = subview (X.exports_.h_view, range_type (0, newSize));
3705  }
3706  // These two DualViews already either have the right number of
3707  // entries, or zero entries. This means that we don't need to
3708  // resize them.
3709  this->numImportPacketsPerLID_ = X.numImportPacketsPerLID_;
3710  this->numExportPacketsPerLID_ = X.numExportPacketsPerLID_;
3711  }
3712 
3713 
3714  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3715  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Return a read-only Vector that views column j of this MultiVector
// (shallow; shares data), via the single-column view constructor.
3717  getVector (const size_t j) const
3718  {
3720  return Teuchos::rcp (new V (*this, j));
3721  }
3722 
3723 
3724  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3725  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
// Return a writable Vector that views column j of this MultiVector
// (shallow; writes through it affect *this).
3727  getVectorNonConst (const size_t j)
3728  {
3730  return Teuchos::rcp (new V (*this, j));
3731  }
3732 
3733 
3734  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3735  void
// Deep-copy all local entries into the user-supplied 1-D array A, in
// column-major order with leading dimension LDA.  Copies from whichever
// memory space (host or device) holds the most recent data, so it does
// not trigger a sync.  Throws std::runtime_error if LDA < numRows or A
// is too short.
3737  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3738  {
3739  typedef decltype (this->template getLocalView<device_type> ())
3740  dev_view_type;
3741  typedef decltype (this->template getLocalView<Kokkos::HostSpace> ())
3742  host_view_type;
3743  typedef impl_scalar_type IST;
// Unmanaged host view type used to wrap each destination column of A.
3744  typedef Kokkos::View<IST*, typename host_view_type::array_layout,
3745  Kokkos::HostSpace, Kokkos::MemoryUnmanaged> input_col_type;
3746  const char tfecfFuncName[] = "get1dCopy: ";
3747 
3748  const size_t numRows = this->getLocalLength ();
3749  const size_t numCols = this->getNumVectors ();
3750  const std::pair<size_t, size_t> rowRange (0, numRows);
3751 
3752  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3753  LDA < numRows, std::runtime_error,
3754  "LDA = " << LDA << " < numRows = " << numRows << ".");
3755  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3756  numRows > static_cast<size_t> (0) &&
3757  numCols > static_cast<size_t> (0) &&
3758  static_cast<size_t> (A.size ()) < LDA * (numCols - 1) + numRows,
3759  std::runtime_error,
3760  "A.size() = " << A.size () << ", but its size must be at least "
3761  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3762 
3763  // FIXME (mfh 22 Jul 2014, 10 Dec 2014) Currently, it doesn't work
3764  // to do a 2-D copy, even if this MultiVector has constant stride.
3765  // This is because Kokkos can't currently tell the difference
3766  // between padding (which permits a single deep_copy for the whole
3767  // 2-D View) and stride > numRows (which does NOT permit a single
3768  // deep_copy for the whole 2-D View). Carter is working on this,
3769  // but for now, the temporary fix is to copy one column at a time.
3770 
3771  // Use the most recently updated version of this MultiVector's
3772  // data. This avoids sync'ing, which could violate users'
3773  // expectations.
3774  //
3775  // If we need sync to device, then host has the most recent version.
3776  const bool useHostVersion = this->template need_sync<device_type> ();
3777 
3778  dev_view_type srcView_dev;
3779  host_view_type srcView_host;
3780  if (useHostVersion) {
3781  srcView_host = this->template getLocalView<Kokkos::HostSpace> ();
3782  }
3783  else {
3784  srcView_dev = this->template getLocalView<device_type> ();
3785  }
3786 
// Copy one column at a time (see FIXME above).  Each destination
// column is wrapped in an unmanaged host view so Kokkos::deep_copy can
// copy from either host or device source.
3787  for (size_t j = 0; j < numCols; ++j) {
3788  const size_t srcCol =
3789  this->isConstantStride () ? j : this->whichVectors_[j];
3790  const size_t dstCol = j;
3791  IST* const dstColRaw =
3792  reinterpret_cast<IST*> (A.getRawPtr () + LDA * dstCol);
3793  input_col_type dstColView (dstColRaw, numRows);
3794 
3795  if (useHostVersion) {
3796  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3797  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3798  dstColView.extent (0) != srcColView_host.extent (0),
3799  std::logic_error, ": srcColView and dstColView_host have different "
3800  "dimensions. Please report this bug to the Tpetra developers.");
3801  Kokkos::deep_copy (dstColView, srcColView_host);
3802  }
3803  else {
3804  auto srcColView_dev = Kokkos::subview (srcView_dev, rowRange, srcCol);
3805  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3806  dstColView.extent (0) != srcColView_dev.extent (0),
3807  std::logic_error, ": srcColView and dstColView_dev have different "
3808  "dimensions. Please report this bug to the Tpetra developers.");
3809  Kokkos::deep_copy (dstColView, srcColView_dev);
3810  }
3811  }
3812  }
3813 
3814 
3815  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3816  void
// Deep-copy each column into its own caller-supplied array: column j
// goes into ArrayOfPtrs[j].  Validates every destination length before
// copying anything, then delegates each column to Vector::get1dCopy.
3818  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3819  {
3821  const char tfecfFuncName[] = "get2dCopy: ";
3822  const size_t numRows = this->getLocalLength ();
3823  const size_t numCols = this->getNumVectors ();
3824 
3825  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3826  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3827  std::runtime_error, "Input array of pointers must contain as many "
3828  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3829  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3830 
3831  if (numRows != 0 && numCols != 0) {
3832  // No side effects until we've validated the input.
3833  for (size_t j = 0; j < numCols; ++j) {
3834  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3835  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3836  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3837  "the input array of arrays is not long enough to fit all entries in "
3838  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3839  << " < getLocalLength() = " << numRows << ".");
3840  }
3841 
3842  // We've validated the input, so it's safe to start copying.
3843  for (size_t j = 0; j < numCols; ++j) {
3844  Teuchos::RCP<const V> X_j = this->getVector (j);
// Use the destination array's own length as the leading dimension.
3845  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3846  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3847  }
3848  }
3849  }
3850 
3851  namespace { // (anonymous)
// File-local helper shared by get{1,2}dView{,NonConst}: sync X to host
// if needed, optionally mark host data modified, and return the host
// view of X's local data.
3852  template <class SC, class LO, class GO, class NT>
3854  syncMVToHostIfNeededAndGetHostView (MultiVector<SC, LO, GO, NT>& X,
3855  const bool markModified)
3856  {
3857  // NOTE (mfh 16 May 2016) get?dView() and get?dViewNonConst()
3858  // (replace ? with 1 or 2) have always been device->host
3859  // synchronization points, since <= 2012. We retain this
3860  // behavior for backwards compatibility.
3861  using host_memory_space = Kokkos::HostSpace;
3862  if (X.template need_sync<host_memory_space> ()) {
3863  X.template sync<host_memory_space> ();
3864  }
// Nonconst callers pass markModified = true so the host copy is flagged
// as the most recent version.
3865  if (markModified) {
3866  X.template modify<host_memory_space> ();
3867  }
3868  // get{1,2}dView() and get{1,2}dViewNonConst() all return a host
3869  // view of the data.
3870  return X.template getLocalView<host_memory_space> ();
3871  }
3872  } // namespace (anonymous)
3873 
3874 
3875  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3876  Teuchos::ArrayRCP<const Scalar>
3877  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
// Return a persisting, read-only host view of ALL local data as one
// flat array (column-major with this MultiVector's stride).  Requires
// constant stride; returns null if there are no rows or no columns.
3878  get1dView () const
3879  {
3880  if (getLocalLength () == 0 || getNumVectors () == 0) {
3881  return Teuchos::null;
3882  } else {
3883  TEUCHOS_TEST_FOR_EXCEPTION(
3884  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3885  "get1dView: This MultiVector does not have constant stride, so it is "
3886  "not possible to view its data as a single array. You may check "
3887  "whether a MultiVector has constant stride by calling "
3888  "isConstantStride().");
3889  // Since get1dView() is and was always marked const, I have to
3890  // cast away const here in order not to break backwards
3891  // compatibility.
3892  constexpr bool markModified = false;
3894  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3895  markModified);
3896  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3897  Kokkos::Compat::persistingView (X_lcl);
3898  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3899  }
3900  }
3901 
3902  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3903  Teuchos::ArrayRCP<Scalar>
// Writable counterpart of get1dView(): same constant-stride
// requirement and null-on-empty behavior, but marks host data as
// modified since the caller may write through the result.
3906  {
3907  if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3908  return Teuchos::null;
3909  }
3910  else {
3911  TEUCHOS_TEST_FOR_EXCEPTION
3912  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3913  "get1dViewNonConst: This MultiVector does not have constant stride, "
3914  "so it is not possible to view its data as a single array. You may "
3915  "check whether a MultiVector has constant stride by calling "
3916  "isConstantStride().");
3917  constexpr bool markModified = true;
3918  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
3919  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3920  Kokkos::Compat::persistingView (X_lcl);
3921  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3922  }
3923  }
3924 
3925  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3926  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
// Return one persisting, writable host ArrayRCP per column (an array of
// arrays).  Syncs to host and marks host data modified.  Works for both
// constant-stride and noncontiguous-column MultiVectors.
3929  {
3930  constexpr bool markModified = true;
3931  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
3932 
3933  // Don't use the row range here on the outside, in order to avoid
3934  // a strided return type (in case Kokkos::subview is conservative
3935  // about that). Instead, use the row range for the column views
3936  // in the loop.
3937  const size_t myNumRows = this->getLocalLength ();
3938  const size_t numCols = this->getNumVectors ();
3939  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3940 
3941  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3942  for (size_t j = 0; j < numCols; ++j) {
// Map logical column j to its physical column before slicing.
3943  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3944  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3945  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3946  Kokkos::Compat::persistingView (X_lcl_j);
3947  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3948  }
3949  return views;
3950  }
3951 
// get2dView: Return a const, host-accessible 2-D view: one ArrayRCP of
// const Scalar per column.  Read-only analogue of the nonconst 2-D view
// method above; does NOT mark host data as modified.
3952  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3953  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3955  get2dView () const
3956  {
3957  // Since get2dView() is and was always marked const, I have to
3958  // cast away const here in order not to break backwards
3959  // compatibility.
3960  constexpr bool markModified = false;
3962  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3963  markModified);
3964  // Don't use the row range here on the outside, in order to avoid
3965  // a strided return type (in case Kokkos::subview is conservative
3966  // about that). Instead, use the row range for the column views
3967  // in the loop.
3968  const size_t myNumRows = this->getLocalLength ();
3969  const size_t numCols = this->getNumVectors ();
3970  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3971 
3972  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
3973  for (size_t j = 0; j < numCols; ++j) {
  // Map logical column j to the stored column when stride is not constant.
3974  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3975  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3976  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3977  Kokkos::Compat::persistingView (X_lcl_j);
3978  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
3979  }
3980  return views;
3981  }
3982 
// multiply: Compute *this = beta * (*this) + alpha * op(A) * op(B),
// treating *this (C), A, and B as dense 2-D matrices.
//
// (The parameter lines declaring A and B are not visible in this
// rendering; the body uses them as MultiVector arguments.)
//
// transA, transB: whether to use A (resp. B), its transpose, or its
//   conjugate transpose.
// alpha, beta:    scalar coefficients of the GEMM update.
//
// Throws std::runtime_error if the distribution pattern of C, A, and B
// is not one of the three supported cases documented below, and (in
// debug builds) if local dimensions are inconsistent on any process.
3983  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3984  void
3986  multiply (Teuchos::ETransp transA,
3987  Teuchos::ETransp transB,
3988  const Scalar& alpha,
3991  const Scalar& beta)
3992  {
3993  using Teuchos::CONJ_TRANS;
3994  using Teuchos::NO_TRANS;
3995  using Teuchos::TRANS;
3996  using Teuchos::RCP;
3997  using Teuchos::rcp;
3998  using Teuchos::rcpFromRef;
3999  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
4000  typedef Teuchos::ScalarTraits<Scalar> STS;
4002  const char tfecfFuncName[] = "multiply: ";
4003  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::multiply");
4004 
4005  // This routine performs a variety of matrix-matrix multiply
4006  // operations, interpreting the MultiVector (this-aka C , A and B)
4007  // as 2D matrices. Variations are due to the fact that A, B and C
4008  // can be local replicated or global distributed MultiVectors and
4009  // that we may or may not operate with the transpose of A and B.
4010  // Possible cases are:
4011  //
4012  // Operations # Cases Notes
4013  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
4014  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
4015  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
4016  //
4017  // The following operations are not meaningful for 1-D
4018  // distributions:
4019  //
4020  // u1) C(local) = A^T(distr) * B^T(distr) 1
4021  // u2) C(local) = A (distr) * B^X(distr) 2
4022  // u3) C(distr) = A^X(local) * B^X(local) 4
4023  // u4) C(distr) = A^X(local) * B^X(distr) 4
4024  // u5) C(distr) = A^T(distr) * B^X(local) 2
4025  // u6) C(local) = A^X(distr) * B^X(local) 4
4026  // u7) C(distr) = A^X(distr) * B^X(local) 4
4027  // u8) C(local) = A^X(local) * B^X(distr) 4
4028  //
4029  // Total number of cases: 32 (= 2^5).
4030 
4031  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
4032 
4033  // In debug mode, check compatibility of local dimensions. We
4034  // only do this in debug mode, since it requires an all-reduce
4035  // to ensure correctness on all processses. It's entirely
4036  // possible that only some processes may have incompatible local
4037  // dimensions. Throwing an exception only on those processes
4038  // could cause this method to hang.
4039  const bool debug = ::Tpetra::Details::Behavior::debug ();
4040  if (debug) {
4041  if (! this->getMap ().is_null () && ! this->getMap ()->getComm ().is_null ()) {
4042  using Teuchos::Comm;
4043  using Teuchos::RCP;
4044  using Teuchos::REDUCE_MIN;
4045  using Teuchos::reduceAll;
4046  using Teuchos::outArg;
4047 
4048  RCP<const Comm<int> > comm = this->getMap ()->getComm ();
  // Effective (post-transpose) dimensions of op(A) and op(B).
4049  const size_t A_nrows =
4050  (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
4051  const size_t A_ncols =
4052  (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
4053  const size_t B_nrows =
4054  (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
4055  const size_t B_ncols =
4056  (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
4057 
4058  const bool lclBad = this->getLocalLength () != A_nrows ||
4059  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
  // All-reduce (min) so every process throws together, avoiding hangs.
4060  const int lclGood = lclBad ? 0 : 1;
4061  int gblGood = 0;
4062  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
4063 
4064  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4065  (gblGood != 1, std::runtime_error, "Local dimensions of '*this', "
4066  "op(A), and op(B) are not consistent on at least one process "
4067  "in this object's communicator.");
4068  }
4069  }
4070 
4071  const bool A_is_local = ! A.isDistributed ();
4072  const bool B_is_local = ! B.isDistributed ();
4073  const bool C_is_local = ! this->isDistributed ();
4074  // Case 1: C(local) = A^X(local) * B^X(local)
4075  const bool Case1 = C_is_local && A_is_local && B_is_local;
4076  // Case 2: C(local) = A^T(distr) * B (distr)
4077  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
4078  transA != NO_TRANS &&
4079  transB == NO_TRANS;
4080  // Case 3: C(distr) = A (distr) * B^X(local)
4081  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
4082  transA == NO_TRANS;
4083 
4084  // Test that we are considering a meaningful case
4085  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4086  (! Case1 && ! Case2 && ! Case3, std::runtime_error,
4087  "Multiplication of op(A) and op(B) into *this is not a "
4088  "supported use case.");
4089 
4090  if (beta != STS::zero () && Case2) {
4091  // If Case2, then C is local and contributions must be summed
4092  // across all processes. However, if beta != 0, then accumulate
4093  // beta*C into the sum. When summing across all processes, we
4094  // only want to accumulate this once, so set beta == 0 on all
4095  // processes except Process 0.
4096  const int myRank = this->getMap ()->getComm ()->getRank ();
4097  if (myRank != 0) {
4098  beta_local = ATS::zero ();
4099  }
4100  }
4101 
4102  // We only know how to do matrix-matrix multiplies if all the
4103  // MultiVectors have constant stride. If not, we have to make
4104  // temporary copies of those MultiVectors (including possibly
4105  // *this) that don't have constant stride.
4106  RCP<MV> C_tmp;
4107  if (! isConstantStride ()) {
4108  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
4109  } else {
  // Non-owning RCP: C_tmp aliases *this; no copy-back needed later.
4110  C_tmp = rcp (this, false);
4111  }
4112 
4113  RCP<const MV> A_tmp;
4114  if (! A.isConstantStride ()) {
4115  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
4116  } else {
4117  A_tmp = rcpFromRef (A);
4118  }
4119 
4120  RCP<const MV> B_tmp;
4121  if (! B.isConstantStride ()) {
4122  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
4123  } else {
4124  B_tmp = rcpFromRef (B);
4125  }
4126 
4127  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4128  (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
4129  ! A_tmp->isConstantStride (), std::logic_error,
4130  "Failed to make temporary constant-stride copies of MultiVectors.");
4131 
4132  {
4133  typedef LocalOrdinal LO;
4134 
  // Take subviews restricted to the actual local dimensions, since
  // the underlying allocations may be larger than what is in use.
4135  const LO A_lclNumRows = A_tmp->getLocalLength ();
4136  const LO A_numVecs = A_tmp->getNumVectors ();
4137  auto A_lcl = A_tmp->template getLocalView<device_type> ();
4138  auto A_sub = Kokkos::subview (A_lcl,
4139  std::make_pair (LO (0), A_lclNumRows),
4140  std::make_pair (LO (0), A_numVecs));
4141  const LO B_lclNumRows = B_tmp->getLocalLength ();
4142  const LO B_numVecs = B_tmp->getNumVectors ();
4143  auto B_lcl = B_tmp->template getLocalView<device_type> ();
4144  auto B_sub = Kokkos::subview (B_lcl,
4145  std::make_pair (LO (0), B_lclNumRows),
4146  std::make_pair (LO (0), B_numVecs));
4147  const LO C_lclNumRows = C_tmp->getLocalLength ();
4148  const LO C_numVecs = C_tmp->getNumVectors ();
4149  auto C_lcl = C_tmp->template getLocalView<device_type> ();
4150  auto C_sub = Kokkos::subview (C_lcl,
4151  std::make_pair (LO (0), C_lclNumRows),
4152  std::make_pair (LO (0), C_numVecs));
  // Translate Teuchos::ETransp to the BLAS-style character arguments
  // that KokkosBlas::gemm expects ('N', 'T', or 'C').
4153  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4154  (transA == Teuchos::TRANS ? 'T' : 'C'));
4155  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4156  (transB == Teuchos::TRANS ? 'T' : 'C'));
4157  const impl_scalar_type alpha_IST (alpha);
4158 
4159  ::Tpetra::Details::ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4160  KokkosBlas::gemm(&ctransA, &ctransB, alpha_IST, A_sub, B_sub, beta_local, C_sub);
4161  }
4162 
4163  if (! isConstantStride ()) {
4164  deep_copy (*this, *C_tmp); // Copy the result back into *this.
4165  }
4166 
4167  // Dispose of (possibly) extra copies of A and B.
4168  A_tmp = Teuchos::null;
4169  B_tmp = Teuchos::null;
4170 
4171  // If Case 2 then sum up *this and distribute it to all processes.
4172  if (Case2) {
4173  this->reduce ();
4174  }
4175  }
4176 
// elementWiseMultiply: Compute, column by column,
//   this(i,j) = scalarThis * this(i,j) + scalarAB * A(i) * B(i,j),
// where A is a single-column Vector and B is a MultiVector.
// (The parameter lines declaring A and B are not visible in this
// rendering; the body uses A as a Vector (V) and B as a MultiVector (MV).)
//
// Throws std::runtime_error if local lengths differ, or if B's number
// of vectors differs from this object's.
4177  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4178  void
4180  elementWiseMultiply (Scalar scalarAB,
4183  Scalar scalarThis)
4184  {
4185  using Kokkos::ALL;
4186  using Kokkos::subview;
4189  typedef typename MV::dual_view_type::t_dev::memory_space dev_memory_space;
4190 
4191  const char tfecfFuncName[] = "elementWiseMultiply: ";
4192 
4193  const size_t lclNumRows = this->getLocalLength ();
4194  const size_t numVecs = this->getNumVectors ();
4195 
4196  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4197  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4198  std::runtime_error, "MultiVectors do not have the same local length.");
4199  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4200  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4201  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4202  << ".");
4203 
4204  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
  // const_cast on A and B: sync() changes only DualView bookkeeping,
  // not the mathematical values, so the inputs stay logically const.
4205  if (this->template need_sync<dev_memory_space> ()) this->template sync<dev_memory_space> ();
4206  if (A.template need_sync<dev_memory_space> ()) const_cast<V&>(A).template sync<dev_memory_space> ();
4207  if (B.template need_sync<dev_memory_space> ()) const_cast<MV&>(B).template sync<dev_memory_space> ();
4208 
4209  this->template modify<dev_memory_space> ();
4210 
4211  auto this_view = this->template getLocalView<dev_memory_space> ();
4212  auto A_view = A.template getLocalView<dev_memory_space> ();
4213  auto B_view = B.template getLocalView<dev_memory_space> ();
4214 
4215  if (isConstantStride () && B.isConstantStride ()) {
4216  // A is just a Vector; it only has one column, so it always has
4217  // constant stride.
4218  //
4219  // If both *this and B have constant stride, we can do an
4220  // element-wise multiply on all columns at once.
4221  KokkosBlas::mult (scalarThis,
4222  this_view,
4223  scalarAB,
4224  subview (A_view, ALL (), 0),
4225  B_view);
4226  }
4227  else {
  // Otherwise, process one column at a time, mapping logical column j
  // to the stored column via whichVectors_ where needed.
4228  for (size_t j = 0; j < numVecs; ++j) {
4229  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4230  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4231  KokkosBlas::mult (scalarThis,
4232  subview (this_view, ALL (), C_col),
4233  scalarAB,
4234  subview (A_view, ALL (), 0),
4235  subview (B_view, ALL (), B_col));
4236  }
4237  }
4238  }
4239 
// reduce: Element-wise sum the entries of this locally replicated (NOT
// distributed) MultiVector over all processes in its Map's communicator,
// leaving the identical summed result on every process.
//
// Throws std::runtime_error if the MultiVector is distributed, or if the
// local row count does not fit in int (MPI count limitation).  No-op for
// a single-process communicator.  Leaves *this unsynchronized between
// host and device (only the active side is updated and marked modified).
4240  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4241  void
4244  {
4245  using Teuchos::reduceAll;
4246  using Teuchos::REDUCE_SUM;
4247  typedef decltype (this->template getLocalView<device_type> ())
4248  dev_view_type;
4249  typedef decltype (this->template getLocalView<Kokkos::HostSpace> ())
4250  host_view_type;
4251 
4252  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::reduce");
4253 
4254  TEUCHOS_TEST_FOR_EXCEPTION
4255  (this->isDistributed (), std::runtime_error,
4256  "Tpetra::MultiVector::reduce should only be called with locally "
4257  "replicated or otherwise not distributed MultiVector objects.");
4258  const Teuchos::Comm<int>& comm = * (this->getMap ()->getComm ());
4259  if (comm.getSize () == 1) {
4260  return; // MultiVector is already "reduced" to one process
4261  }
4262 
4263  const size_t lclNumRows = getLocalLength ();
4264  const size_t numCols = getNumVectors ();
4265  const size_t totalAllocSize = lclNumRows * numCols;
4266 
4267  // FIXME (mfh 16 June 2014) This exception will cause deadlock if
4268  // it triggers on only some processes. We don't have a good way
4269  // to pack this result into the all-reduce below, but this would
4270  // be a good reason to set a "local error flag" and find other
4271  // opportunities to let it propagate.
4272  TEUCHOS_TEST_FOR_EXCEPTION(
4273  lclNumRows > static_cast<size_t> (std::numeric_limits<int>::max ()),
4274  std::runtime_error, "Tpetra::MultiVector::reduce: On Process " <<
4275  comm.getRank () << ", the number of local rows " << lclNumRows <<
4276  " does not fit in int.");
4277 
4278  //
4279  // Use MPI to sum the entries across all local blocks.
4280  //
4281 
4282  // Get the most recently updated version of the data.
4283  //
4284  // If we need sync to device, then host has the most recent version.
4285  const bool useHostVersion = this->template need_sync<device_type> ();
4286  // Only one of these (the most recently updated one) is active.
4287  dev_view_type srcView_dev;
4288  host_view_type srcView_host;
4289  if (useHostVersion) {
4290  srcView_host = this->template getLocalView<Kokkos::HostSpace> ();
4291  if (lclNumRows != static_cast<size_t> (srcView_host.extent (0))) {
4292  // Make sure the number of rows is correct. If not, take a subview.
4293  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
4294  srcView_host = Kokkos::subview (srcView_host, rowRng, Kokkos::ALL ());
4295  }
4296  }
4297  else {
4298  srcView_dev = this->template getLocalView<device_type> ();
4299  if (lclNumRows != static_cast<size_t> (srcView_dev.extent (0))) {
4300  // Make sure the number of rows is correct. If not, take a subview.
4301  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
4302  srcView_dev = Kokkos::subview (srcView_dev, rowRng, Kokkos::ALL ());
4303  }
4304  }
4305 
4306  // If this MultiVector's local data are stored contiguously, we
4307  // can use the local View as the source buffer in the
4308  // MPI_Allreduce. Otherwise, we have to allocate a temporary
4309  // source buffer and pack.
4310  const bool contig = isConstantStride () && getStride () == lclNumRows;
4311  dev_view_type srcBuf_dev;
4312  host_view_type srcBuf_host;
4313  if (useHostVersion) {
4314  if (contig) {
4315  srcBuf_host = srcView_host; // use the View as-is
4316  }
4317  else { // need to repack into a contiguous buffer
4318  srcBuf_host = decltype (srcBuf_host) ("srcBuf", lclNumRows, numCols);
4319  Kokkos::deep_copy (srcBuf_host, srcView_host);
4320  }
4321  }
4322  else { // use device version
4323  if (contig) {
4324  srcBuf_dev = srcView_dev; // use the View as-is
4325  }
4326  else { // need to repack into a contiguous buffer
4327  srcBuf_dev = decltype (srcBuf_dev) ("srcBuf", lclNumRows, numCols);
4328  Kokkos::deep_copy (srcBuf_dev, srcView_dev);
4329  }
4330  }
4331 
4332  // Check expected invariant of the above block of code. At this
4333  // point, either the srcBuf of choice points to the srcView of
4334  // choice, or it has the right allocation size.
4335  {
4336  // Use >=, not ==, because if srcBuf just points to srcView,
4337  // then srcView may actually be bigger than what we need.
4338  const bool correct =
4339  (useHostVersion && static_cast<size_t> (srcBuf_host.size ()) >= totalAllocSize) ||
4340  (! useHostVersion && static_cast<size_t> (srcBuf_dev.size ()) >= totalAllocSize);
4341  TEUCHOS_TEST_FOR_EXCEPTION
4342  (! correct, std::logic_error, "Tpetra::MultiVector::reduce: Violated "
4343  "invariant of temporary source buffer construction. Please report "
4344  "this bug to the Tpetra developers.");
4345  }
4346 
4347  // MPI requires that the send and receive buffers don't alias one
4348  // another, so we have to copy temporary storage for the result.
4349  //
4350  // We expect that MPI implementations will know how to read device
4351  // pointers.
4352  dev_view_type tgtBuf_dev;
4353  host_view_type tgtBuf_host;
4354  if (useHostVersion) {
4355  tgtBuf_host = host_view_type ("tgtBuf", lclNumRows, numCols);
4356  }
4357  else {
4358  tgtBuf_dev = dev_view_type ("tgtBuf", lclNumRows, numCols);
4359  }
4360 
4361  const int reduceCount = static_cast<int> (totalAllocSize);
4362  if (useHostVersion) {
4363  TEUCHOS_TEST_FOR_EXCEPTION
4364  (static_cast<size_t> (tgtBuf_host.size ()) < totalAllocSize,
4365  std::logic_error, "Tpetra::MultiVector::reduce: tgtBuf_host.size() = "
4366  << tgtBuf_host.size () << " < lclNumRows*numCols = " << totalAllocSize
4367  << ". Please report this bug to the Tpetra developers.");
4368  reduceAll<int, impl_scalar_type> (comm, REDUCE_SUM, reduceCount,
4369  srcBuf_host.data (),
4370  tgtBuf_host.data ());
4371  }
4372  else { // use device version
4373  TEUCHOS_TEST_FOR_EXCEPTION
4374  (static_cast<size_t> (tgtBuf_dev.size ()) < totalAllocSize,
4375  std::logic_error, "Tpetra::MultiVector::reduce: tgtBuf_dev.size() = "
4376  << tgtBuf_dev.size () << " < lclNumRows*numCols = " << totalAllocSize
4377  << ". Please report this bug to the Tpetra developers.");
4378  reduceAll<int, impl_scalar_type> (comm, REDUCE_SUM, reduceCount,
4379  srcBuf_dev.data (),
4380  tgtBuf_dev.data ());
4381  }
4382 
4383  // Write back the results to *this.
4384  if (useHostVersion) {
4385  this->template modify<Kokkos::HostSpace> ();
  // Constant stride means whole-View deep_copy handles layout correctly.
4386  if (contig || isConstantStride ()) {
4387  Kokkos::deep_copy (srcView_host, tgtBuf_host);
4388  }
4389  else { // need to copy one column at a time
4390  for (size_t j = 0; j < numCols; ++j) {
4391  auto X_j_out = Kokkos::subview (srcView_host, Kokkos::ALL (), j);
4392  auto X_j_in = Kokkos::subview (tgtBuf_host, Kokkos::ALL (), j);
4393  Kokkos::deep_copy (X_j_out, X_j_in);
4394  }
4395  }
4396  }
4397  else { // use device version
4398  this->template modify<device_type> ();
4399  if (contig || isConstantStride ()) {
4400  Kokkos::deep_copy (srcView_dev, tgtBuf_dev);
4401  }
4402  else { // need to copy one column at a time
4403  for (size_t j = 0; j < numCols; ++j) {
4404  auto X_j_out = Kokkos::subview (srcView_dev, Kokkos::ALL (), j);
4405  auto X_j_in = Kokkos::subview (tgtBuf_dev, Kokkos::ALL (), j);
4406  Kokkos::deep_copy (X_j_out, X_j_in);
4407  }
4408  }
4409  }
4410 
4411  // We leave *this unsynchronized.
4412  }
4413 
4414 
// replaceLocalValue: Set entry (lclRow, col) of the HOST copy of the
// local data to ScalarValue.  Writes directly to view_.h_view; does NOT
// mark the host data as modified, and performs index bounds checking
// only in debug (HAVE_TPETRA_DEBUG) builds.
4415  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4416  void
4418  replaceLocalValue (const LocalOrdinal lclRow,
4419  const size_t col,
4420  const impl_scalar_type& ScalarValue) const
4421  {
4422 #ifdef HAVE_TPETRA_DEBUG
4423  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4424  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4425  TEUCHOS_TEST_FOR_EXCEPTION(
4426  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4427  std::runtime_error,
4428  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4429  << " is invalid. The range of valid row indices on this process "
4430  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4431  << ", " << maxLocalIndex << "].");
4432  TEUCHOS_TEST_FOR_EXCEPTION(
4433  vectorIndexOutOfRange(col),
4434  std::runtime_error,
4435  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4436  << " of the multivector is invalid.");
4437 #endif
  // Map logical column to stored column when stride is not constant.
4438  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4439  view_.h_view (lclRow, colInd) = ScalarValue;
4440  }
4441 
4442 
// sumIntoLocalValue: Add value into entry (lclRow, col) of the HOST copy
// of the local data, optionally with an atomic update (for concurrent
// host-thread writers).  Like replaceLocalValue, writes directly to
// view_.h_view, does not mark data modified, and bounds-checks only in
// debug (HAVE_TPETRA_DEBUG) builds.
4443  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4444  void
4446  sumIntoLocalValue (const LocalOrdinal lclRow,
4447  const size_t col,
4448  const impl_scalar_type& value,
4449  const bool atomic) const
4450  {
4451 #ifdef HAVE_TPETRA_DEBUG
4452  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4453  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4454  TEUCHOS_TEST_FOR_EXCEPTION(
4455  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4456  std::runtime_error,
4457  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4458  << " is invalid. The range of valid row indices on this process "
4459  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4460  << ", " << maxLocalIndex << "].");
4461  TEUCHOS_TEST_FOR_EXCEPTION(
4462  vectorIndexOutOfRange(col),
4463  std::runtime_error,
4464  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4465  << " of the multivector is invalid.");
4466 #endif
  // Map logical column to stored column when stride is not constant.
4467  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4468  if (atomic) {
4469  Kokkos::atomic_add (& (view_.h_view(lclRow, colInd)), value);
4470  }
4471  else {
4472  view_.h_view (lclRow, colInd) += value;
4473  }
4474  }
4475 
4476 
// replaceGlobalValue: Set entry (gblRow, col) by translating the global
// row index to a local index via this object's Map, then delegating to
// replaceLocalValue.  In debug builds, throws if gblRow is not owned by
// this process or col is out of range.
4477  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4478  void
4480  replaceGlobalValue (const GlobalOrdinal gblRow,
4481  const size_t col,
4482  const impl_scalar_type& ScalarValue) const
4483  {
4484  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4485  // touches the RCP's reference count, which isn't thread safe.
4486  const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4487 #ifdef HAVE_TPETRA_DEBUG
4488  const char tfecfFuncName[] = "replaceGlobalValue: ";
4489  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4490  (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4491  std::runtime_error,
4492  "Global row index " << gblRow << " is not present on this process "
4493  << this->getMap ()->getComm ()->getRank () << ".");
4494  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4495  (this->vectorIndexOutOfRange (col), std::runtime_error,
4496  "Vector index " << col << " of the MultiVector is invalid.");
4497 #endif // HAVE_TPETRA_DEBUG
4498  this->replaceLocalValue (lclRow, col, ScalarValue);
4499  }
4500 
// sumIntoGlobalValue: Add value into entry (globalRow, col), translating
// the global row index to a local index via this object's Map and then
// delegating to sumIntoLocalValue (optionally atomic).
// NOTE(review): the debug checks here are guarded by HAVE_TEUCHOS_DEBUG,
// whereas the sibling methods above use HAVE_TPETRA_DEBUG -- confirm
// whether this inconsistency is intentional.
4501  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4502  void
4504  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4505  const size_t col,
4506  const impl_scalar_type& value,
4507  const bool atomic) const
4508  {
4509  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4510  // touches the RCP's reference count, which isn't thread safe.
4511  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4512 #ifdef HAVE_TEUCHOS_DEBUG
4513  TEUCHOS_TEST_FOR_EXCEPTION(
4514  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4515  std::runtime_error,
4516  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4517  << " is not present on this process "
4518  << this->getMap ()->getComm ()->getRank () << ".");
4519  TEUCHOS_TEST_FOR_EXCEPTION(
4520  vectorIndexOutOfRange(col),
4521  std::runtime_error,
4522  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4523  << " of the multivector is invalid.");
4524 #endif
4525  this->sumIntoLocalValue (lclRow, col, value, atomic);
4526  }
4527 
// getSubArrayRCP: Return a persisting ArrayRCP viewing column j of this
// MultiVector's DEVICE data (d_view).  Translates the logical column j
// through whichVectors_ when stride is not constant.  The input 'arr'
// parameter is not used by this implementation.
4528  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4529  template <class T>
4530  Teuchos::ArrayRCP<T>
4532  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4533  size_t j) const
4534  {
4535  typedef Kokkos::DualView<impl_scalar_type*,
4536  typename dual_view_type::array_layout,
4537  execution_space> col_dual_view_type;
4538  const size_t col = isConstantStride () ? j : whichVectors_[j];
4539  col_dual_view_type X_col =
4540  Kokkos::subview (view_, Kokkos::ALL (), col);
  // persistingView keeps the column View alive as long as the ArrayRCP.
4541  return Kokkos::Compat::persistingView (X_col.d_view);
4542  }
4543 
// getDualView: Return (by value) the Kokkos::DualView holding this
// MultiVector's local data.
4544  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4547  getDualView () const {
4548  return view_;
4549  }
4550 
// descriptionImpl: Build the one-line description() string, with the
// given class name (so Tpetra::Vector can reuse this method).  Includes
// template parameter names, optional object label, global row count,
// and (for non-Vector classes) column count and stride information.
4551  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4552  std::string
4554  descriptionImpl (const std::string& className) const
4555  {
4556  using Teuchos::TypeNameTraits;
4557 
4558  std::ostringstream out;
4559  out << "\"" << className << "\": {";
4560  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4561  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
  // NOTE(review): "Node" here is not followed by ": " like the other
  // entries -- possibly a formatting oversight; output-only.
4562  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4563  << ", Node" << Node::name ()
4564  << "}, ";
4565  if (this->getObjectLabel () != "") {
4566  out << "Label: \"" << this->getObjectLabel () << "\", ";
4567  }
  // NOTE(review): the leading ", " here follows the "}, " (or Label)
  // already ending with a comma, producing a doubled separator in the
  // output string; cosmetic only.
4568  out << ", numRows: " << this->getGlobalLength ();
4569  if (className != "Tpetra::Vector") {
4570  out << ", numCols: " << this->getNumVectors ()
4571  << ", isConstantStride: " << this->isConstantStride ();
4572  }
4573  if (this->isConstantStride ()) {
4574  out << ", columnStride: " << this->getStride ();
4575  }
4576  out << "}";
4577 
4578  return out.str ();
4579  }
4580 
// description: Teuchos::Describable one-line description; delegates to
// descriptionImpl with this class's name.
4581  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4582  std::string
4585  {
4586  return this->descriptionImpl ("Tpetra::MultiVector");
4587  }
4588 
// localDescribeToString: Build this process's contribution to describe()
// output as a string (so the caller can gather and print in rank order).
// Returns an empty string for VERB_LOW or below, or if the Map is null.
// At VERB_EXTREME, prints the local values in Matlab-pasteable format.
// Does not change the MultiVector's host/device sync state: if device
// holds the latest data, it prints from a temporary host mirror copy.
4589  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4590  std::string
4592  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4593  {
4594  typedef LocalOrdinal LO;
4595  using std::endl;
4596 
4597  if (vl <= Teuchos::VERB_LOW) {
4598  return std::string ();
4599  }
4600  auto map = this->getMap ();
4601  if (map.is_null ()) {
4602  return std::string ();
4603  }
  // Accumulate output in a string stream; the fancy-ostream wrapper
  // gives us OSTab-based indentation.
4604  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4605  auto outp = Teuchos::getFancyOStream (outStringP);
4606  Teuchos::FancyOStream& out = *outp;
4607  auto comm = map->getComm ();
4608  const int myRank = comm->getRank ();
4609  const int numProcs = comm->getSize ();
4610 
4611  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4612  Teuchos::OSTab tab1 (out);
4613 
4614  // VERB_MEDIUM and higher prints getLocalLength()
4615  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4616  out << "Local number of rows: " << lclNumRows << endl;
4617 
4618  if (vl > Teuchos::VERB_MEDIUM) {
4619  // VERB_HIGH and higher prints isConstantStride() and getStride().
4620  // The first is only relevant if the Vector has multiple columns.
4621  if (this->getNumVectors () != static_cast<size_t> (1)) {
4622  out << "isConstantStride: " << this->isConstantStride () << endl;
4623  }
4624  if (this->isConstantStride ()) {
4625  out << "Column stride: " << this->getStride () << endl;
4626  }
4627 
4628  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4629  // VERB_EXTREME prints values. Get a host View of the
4630  // Vector's local data, so we can print it. (Don't assume
4631  // that we can access device data directly in host code.)
4632  auto X_dual = this->getDualView ();
4633  typename dual_view_type::t_host X_host;
4634  if (X_dual.template need_sync<Kokkos::HostSpace> ()) {
4635  // Device memory has the latest version. Don't actually
4636  // sync to host; that changes the Vector's state, and may
4637  // change future computations (that use the data's current
4638  // place to decide where to compute). Instead, create a
4639  // temporary host copy and print that.
4640  auto X_dev = this->template getLocalView<device_type> ();
4641  auto X_host_copy = Kokkos::create_mirror_view (X_dev);
4642  Kokkos::deep_copy (X_host_copy, X_dev);
4643  X_host = X_host_copy;
4644  }
4645  else {
4646  // Either host and device are in sync, or host has the
4647  // latest version of the Vector's data. Thus, we can use
4648  // the host version directly.
4649  X_host = this->template getLocalView<Kokkos::HostSpace> ();
4650  }
4651  // The square braces [] and their contents are in Matlab
4652  // format, so users may copy and paste directly into Matlab.
4653  out << "Values: " << endl
4654  << "[";
4655  const LO numCols = static_cast<LO> (this->getNumVectors ());
4656  if (numCols == 1) {
  // Single column: rows separated by "; " (a Matlab column vector).
4657  for (LO i = 0; i < lclNumRows; ++i) {
4658  out << X_host(i,0);
4659  if (i + 1 < lclNumRows) {
4660  out << "; ";
4661  }
4662  }
4663  }
4664  else {
  // Multiple columns: ", " between columns, "; " between rows.
4665  for (LO i = 0; i < lclNumRows; ++i) {
4666  for (LO j = 0; j < numCols; ++j) {
4667  out << X_host(i,j);
4668  if (j + 1 < numCols) {
4669  out << ", ";
4670  }
4671  }
4672  if (i + 1 < lclNumRows) {
4673  out << "; ";
4674  }
4675  }
4676  }
4677  out << "]" << endl;
4678  }
4679  }
4680 
4681  out.flush (); // make sure the ostringstream got everything
4682  return outStringP->str ();
4683  }
4684 
// describeImpl: Implementation of describe(), parameterized on the class
// name so Tpetra::Vector can reuse it.  Collective over the Map's
// communicator when verbosity is above VERB_LOW: each process builds its
// local description string and Process 0 prints them in rank order.
// Does nothing when verbosity is VERB_NONE or the Map/communicator is null.
4685  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4686  void
4688  describeImpl (Teuchos::FancyOStream& out,
4689  const std::string& className,
4690  const Teuchos::EVerbosityLevel verbLevel) const
4691  {
4692  using Teuchos::TypeNameTraits;
4693  using Teuchos::VERB_DEFAULT;
4694  using Teuchos::VERB_NONE;
4695  using Teuchos::VERB_LOW;
4696  using std::endl;
4697  const Teuchos::EVerbosityLevel vl =
4698  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4699 
4700  if (vl == VERB_NONE) {
4701  return; // don't print anything
4702  }
4703  // If this Vector's Comm is null, then the Vector does not
4704  // participate in collective operations with the other processes.
4705  // In that case, it is not even legal to call this method. The
4706  // reasonable thing to do in that case is nothing.
4707  auto map = this->getMap ();
4708  if (map.is_null ()) {
4709  return;
4710  }
4711  auto comm = map->getComm ();
4712  if (comm.is_null ()) {
4713  return;
4714  }
4715 
4716  const int myRank = comm->getRank ();
4717 
4718  // Only Process 0 should touch the output stream, but this method
4719  // in general may need to do communication. Thus, we may need to
4720  // preserve the current tab level across multiple "if (myRank ==
4721  // 0) { ... }" inner scopes. This is why we sometimes create
4722  // OSTab instances by pointer, instead of by value. We only need
4723  // to create them by pointer if the tab level must persist through
4724  // multiple inner scopes.
4725  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4726 
4727  // VERB_LOW and higher prints the equivalent of description().
4728  if (myRank == 0) {
4729  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4730  out << "\"" << className << "\":" << endl;
4731  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4732  {
4733  out << "Template parameters:" << endl;
4734  Teuchos::OSTab tab2 (out);
4735  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4736  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4737  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4738  << "Node: " << Node::name () << endl;
4739  }
4740  if (this->getObjectLabel () != "") {
4741  out << "Label: \"" << this->getObjectLabel () << "\", ";
4742  }
4743  out << "Global number of rows: " << this->getGlobalLength () << endl;
4744  if (className != "Tpetra::Vector") {
4745  out << "Number of columns: " << this->getNumVectors () << endl;
4746  }
4747  // getStride() may differ on different processes, so it (and
4748  // isConstantStride()) properly belong to per-process data.
4749  }
4750 
4751  // This is collective over the Map's communicator.
4752  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4753  const std::string lclStr = this->localDescribeToString (vl);
  // gathervPrint collects each rank's string and prints them in order.
4754  ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4755  }
4756  }
4757 
4758  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4759  void
4761  describe (Teuchos::FancyOStream &out,
4762  const Teuchos::EVerbosityLevel verbLevel) const
4763  {
4764  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4765  }
4766 
4767  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4768  void
4770  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4771  {
4772  replaceMap (newMap);
4773  }
4774 
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
// NOTE(review): the qualified declarator line, presumably
//   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
//   assign (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& src)
// is missing from this extracted text (content numbering jumps 4776->4779);
// per the class declaration this is MultiVector::assign, which deep-copies
// src's local data into *this.  Confirm against the original header.
{
  using LO = LocalOrdinal;
  using DT = device_type;
  // Device type of the DualView's host mirror space: "host side" copies
  // and whichVecs fills below run in this space.
  using HMDT = typename dual_view_type::host_mirror_space::device_type;
  using ::Tpetra::Details::localDeepCopy;
  using ::Tpetra::Details::localDeepCopyConstStride;
  const char prefix[] = "Tpetra::deep_copy (MultiVector): ";
  // Compile-time switch for the std::cout traces below (dead code when false).
  const bool debug = false;

  // Both global dimensions (rows and columns) of src and *this must match.
  TEUCHOS_TEST_FOR_EXCEPTION
    (this->getGlobalLength () != src.getGlobalLength () ||
     this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
     prefix << "Global dimensions of the two Tpetra::MultiVector "
     "objects do not match. src has dimensions [" << src.getGlobalLength ()
     << "," << src.getNumVectors () << "], and *this has dimensions ["
     << this->getGlobalLength () << "," << this->getNumVectors () << "].");
  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
  TEUCHOS_TEST_FOR_EXCEPTION
    (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
     prefix << "The local row counts of the two Tpetra::MultiVector "
     "objects do not match. src has " << src.getLocalLength () << " row(s) "
     << " and *this has " << this->getLocalLength () << " row(s).");

  // See #1510. We're writing to *this, so we don't care about its
  // contents in either memory space, and we don't want
  // DualView::modify to complain about "concurrent modification" of
  // host and device Views.
  this->view_.modified_device () = 0;
  this->view_.modified_host () = 0;

  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
    std::cout << "*** MultiVector::assign: ";
  }

  // Four cases, by whether each of src and *this has constant column
  // stride.  In each case, copy in whichever memory space (host or
  // device) holds src's most recently modified data, since src is
  // const and may not be sync'd.
  if (src.isConstantStride () && this->isConstantStride ()) {
    if (debug && this->getMap ()->getComm ()->getRank () == 0) {
      std::cout << "Both *this and src have constant stride" << std::endl;
    }

    // If we need sync to device, then host has the most recent version.
    const bool useHostVersion = src.template need_sync<device_type> ();

    if (useHostVersion) { // Host memory has the most recent version of src.
      // Copy from src to dst on host.
      this->template modify<HMDT> ();
      localDeepCopyConstStride (this->template getLocalView<HMDT> (),
                                src.template getLocalView<HMDT> ());
    }
    else { // Device memory has the most recent version of src.
      // Copy from src to dst on device.
      this->template modify<DT> ();
      localDeepCopyConstStride (this->template getLocalView<DT> (),
                                src.template getLocalView<DT> ());
    }
  }
  else {
    if (this->isConstantStride ()) {
      if (debug && this->getMap ()->getComm ()->getRank () == 0) {
        std::cout << "Only *this has constant stride";
      }

      const LO numWhichVecs = static_cast<LO> (src.whichVectors_.size ());
      const std::string whichVecsLabel ("MV::deep_copy::whichVecs");

      // We can't sync src, since it is only an input argument.
      // Thus, we have to use the most recently modified version of
      // src, device or host.
      //
      // If we need sync to device, then host has the most recent version.
      const bool useHostVersion = src.template need_sync<device_type> ();
      if (useHostVersion) { // host version of src most recently modified
        if (debug && this->getMap ()->getComm ()->getRank () == 0) {
          std::cout << "; Copy from host version of src" << std::endl;
        }
        // Copy from the host version of src.
        //
        // whichVecs tells the kernel which vectors (columns) of src
        // to copy. Fill whichVecs on the host, and use it there.
        typedef Kokkos::View<LO*, HMDT> whichvecs_type;
        whichvecs_type srcWhichVecs (whichVecsLabel, numWhichVecs);
        for (LO i = 0; i < numWhichVecs; ++i) {
          srcWhichVecs(i) = static_cast<LO> (src.whichVectors_[i]);
        }
        // Copy from the selected vectors of src to dst, on the
        // host. The function ignores its dstWhichVecs argument in
        // this case.
        this->template modify<HMDT> ();
        localDeepCopy (this->template getLocalView<HMDT> (),
                       src.template getLocalView<HMDT> (),
                       true, false, srcWhichVecs, srcWhichVecs);
      }
      else { // copy from the device version of src
        if (debug && this->getMap ()->getComm ()->getRank () == 0) {
          std::cout << "; Copy from device version of src" << std::endl;
        }
        // Copy from the device version of src.
        //
        // whichVecs tells the kernel which vectors (columns) of src
        // to copy. Fill whichVecs on the host, and sync to device.
        typedef Kokkos::DualView<LO*, DT> whichvecs_type;
        whichvecs_type srcWhichVecs (whichVecsLabel, numWhichVecs);
        srcWhichVecs.template modify<HMDT> ();
        for (LO i = 0; i < numWhichVecs; ++i) {
          srcWhichVecs.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
        }
        // Sync the host version of srcWhichVecs to the device.
        srcWhichVecs.template sync<DT> ();

        // Copy from the selected vectors of src to dst, on the
        // device. The function ignores its dstWhichVecs argument
        // in this case.
        this->template modify<DT> ();
        localDeepCopy (this->template getLocalView<DT> (),
                       src.template getLocalView<DT> (),
                       true, false, srcWhichVecs.d_view,
                       srcWhichVecs.d_view);
      }

    }
    else { // dst is NOT constant stride
      if (src.isConstantStride ()) {
        if (debug && this->getMap ()->getComm ()->getRank () == 0) {
          std::cout << "Only src has constant stride" << std::endl;
        }

        // If we need sync to device, then host has the most recent version.
        const bool useHostVersion = src.template need_sync<device_type> ();
        if (useHostVersion) { // src most recently modified on host
          // Copy from the host version of src.
          //
          // whichVecs tells the kernel which vectors (columns) of src
          // to copy. Fill whichVecs on the host, and use it there.
          typedef Kokkos::View<LO*, HMDT> whichvecs_type;
          const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
          whichvecs_type whichVecs ("MV::deep_copy::whichVecs", numWhichVecs);
          for (LO i = 0; i < numWhichVecs; ++i) {
            whichVecs(i) = static_cast<LO> (this->whichVectors_[i]);
          }
          // Copy from src to the selected vectors of dst, on the
          // host. The functor ignores its 4th arg in this case.
          this->template modify<HMDT> ();
          localDeepCopy (this->template getLocalView<HMDT> (),
                         src.template getLocalView<HMDT> (),
                         this->isConstantStride (),
                         src.isConstantStride (),
                         whichVecs, whichVecs);
        }
        else { // Copy from the device version of src.
          // whichVecs tells the kernel which vectors (columns) of dst
          // to copy. Fill whichVecs on the host, and sync to device.
          typedef Kokkos::DualView<LO*, DT> whichvecs_type;
          const std::string whichVecsLabel ("MV::deep_copy::whichVecs");
          const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
          whichvecs_type whichVecs (whichVecsLabel, numWhichVecs);
          whichVecs.template modify<HMDT> ();
          for (LO i = 0; i < numWhichVecs; ++i) {
            whichVecs.h_view(i) = this->whichVectors_[i];
          }
          // Sync the host version of whichVecs to the device.
          whichVecs.template sync<DT> ();

          // Copy src to the selected vectors of dst, on the device.
          this->template modify<DT> ();
          localDeepCopy (this->template getLocalView<DT> (),
                         src.template getLocalView<DT> (),
                         this->isConstantStride (),
                         src.isConstantStride (),
                         whichVecs.d_view,
                         whichVecs.d_view);
        }
      }
      else { // neither src nor dst have constant stride
        if (debug && this->getMap ()->getComm ()->getRank () == 0) {
          std::cout << "Neither *this nor src has constant stride" << std::endl;
        }

        // If we need sync to device, then host has the most recent version.
        const bool useHostVersion = src.template need_sync<device_type> ();
        if (useHostVersion) { // copy from the host version of src
          const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
          Kokkos::View<LO*, HMDT> whichVectorsDst ("dstWhichVecs", dstNumWhichVecs);
          for (LO i = 0; i < dstNumWhichVecs; ++i) {
            whichVectorsDst(i) = this->whichVectors_[i];
          }

          // Use the destination MultiVector's LocalOrdinal type here.
          const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
          Kokkos::View<LO*, HMDT> whichVectorsSrc ("srcWhichVecs", srcNumWhichVecs);
          for (LO i = 0; i < srcNumWhichVecs; ++i) {
            whichVectorsSrc(i) = src.whichVectors_[i];
          }

          // Copy from the selected vectors of src to the selected
          // vectors of dst, on the host.
          this->template modify<HMDT> ();
          localDeepCopy (this->template getLocalView<HMDT> (),
                         src.template getLocalView<HMDT> (),
                         this->isConstantStride (),
                         src.isConstantStride (),
                         whichVectorsDst,
                         whichVectorsSrc);
        }
        else { // copy from the device version of src
          // whichVectorsDst tells the kernel which columns of dst
          // to copy. Fill it on the host, and sync to device.
          const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
          Kokkos::DualView<LO*, DT> whichVecsDst ("MV::deep_copy::whichVecsDst",
                                                  dstNumWhichVecs);
          whichVecsDst.template modify<HMDT> ();
          for (LO i = 0; i < dstNumWhichVecs; ++i) {
            whichVecsDst.h_view(i) = static_cast<LO> (this->whichVectors_[i]);
          }
          // Sync the host version of whichVecsDst to the device.
          whichVecsDst.template sync<DT> ();

          // whichVectorsSrc tells the kernel which vectors
          // (columns) of src to copy. Fill it on the host, and
          // sync to device. Use the destination MultiVector's
          // LocalOrdinal type here.
          const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
          Kokkos::DualView<LO*, DT> whichVecsSrc ("MV::deep_copy::whichVecsSrc",
                                                  srcNumWhichVecs);
          whichVecsSrc.template modify<HMDT> ();
          for (LO i = 0; i < srcNumWhichVecs; ++i) {
            whichVecsSrc.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
          }
          // Sync the host version of whichVecsSrc to the device.
          whichVecsSrc.template sync<DT> ();

          // Copy from the selected vectors of src to the selected
          // vectors of dst, on the device.
          this->template modify<DT> ();
          localDeepCopy (this->template getLocalView<DT> (),
                         src.template getLocalView<DT> (),
                         this->isConstantStride (),
                         src.isConstantStride (),
                         whichVecsDst.d_view,
                         whichVecsSrc.d_view);
        }
      }
    }
  }
}
5022 
5023  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5024  bool
5027  using ::Tpetra::Details::PackTraits;
5028  using ST = impl_scalar_type;
5029  using HES = typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
5030 
5031  const size_t l1 = this->getLocalLength();
5032  const size_t l2 = vec.getLocalLength();
5033  if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
5034  return false;
5035  }
5036  if (l1==0) {
5037  return true;
5038  }
5039 
5040  auto v1 = this->template getLocalView<HES>();
5041  auto v2 = vec.template getLocalView<HES>();
5042  if (PackTraits<ST, HES>::packValueCount (v1(0,0)) !=
5043  PackTraits<ST, HES>::packValueCount (v2(0,0))) {
5044  return false;
5045  }
5046 
5047  return true;
5048  }
5049 
5050  template <class ST, class LO, class GO, class NT>
5053  // Cache maps & views
5054  Teuchos::RCP<const map_type> map = mv.map_;
5055  dual_view_type view, origView;
5056  Teuchos::Array<size_t> whichVectors; // FIXME: This is a deep copy
5057  view = mv.view_;
5058  origView = mv.origView_;
5059  whichVectors = mv.whichVectors_;
5060 
5061  // Swap this-> mv
5062  mv.map_ = this->map_;
5063  mv.view_ = this->view_;
5064  mv.origView_ = this->origView_;
5065  mv.whichVectors_ = this->whichVectors_;
5066 
5067  // Swap mv -> this
5068  this->map_ = map;
5069  this->view_ = view;
5070  this->origView_ = origView;
5071  this->whichVectors_ = whichVectors;
5072  }
5073 } // namespace Classes
5074 
5075  template <class Scalar, class LO, class GO, class NT>
5076  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
5077  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
5078  size_t numVectors)
5079  {
5081  return Teuchos::rcp (new MV (map, numVectors));
5082  }
5083 
5084  template <class ST, class LO, class GO, class NT>
5085  MultiVector<ST, LO, GO, NT>
5087  {
5088  typedef MultiVector<ST, LO, GO, NT> MV;
5089  MV cpy (src.getMap (), src.getNumVectors (), false);
5090  cpy.assign (src);
5091  return cpy;
5092  }
5093 
5094 } // namespace Tpetra
5095 
5096 //
5097 // Explicit instantiation macro
5098 //
5099 // Must be expanded from within the Tpetra namespace!
5100 //
5101 
// Explicit instantiation macro for MultiVector and its nonmember
// functions.  For one (SCALAR, LO, GO, NODE) combination, this
// instantiates the MultiVector class itself (inside namespace
// Classes) plus the createCopy and createMultiVector nonmember
// "constructors".  Must be expanded from within the Tpetra namespace.
#define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
  namespace Classes { template class MultiVector< SCALAR , LO , GO , NODE >; } \
  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors);
5106 
5107 #endif // TPETRA_MULTIVECTOR_DEF_HPP
Tpetra::Classes::MultiVector::isConstantStride
bool isConstantStride() const
Whether this multivector has constant stride between columns.
Definition: Tpetra_MultiVector_def.hpp:741
Tpetra::Classes::DistObject< Scalar, LocalOrdinal, GlobalOrdinal, Node >::exports_
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Definition: Tpetra_DistObject_decl.hpp:990
Tpetra::Classes::MultiVector::getLocalLength
size_t getLocalLength() const
Local number of rows on the calling process.
Definition: Tpetra_MultiVector_def.hpp:748
Tpetra::Classes::MultiVector::whichVectors_
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Definition: Tpetra_MultiVector_decl.hpp:2259
Tpetra::Classes::DistObject< Scalar, LocalOrdinal, GlobalOrdinal, Node >::map_
Teuchos::RCP< const map_type > map_
The Map over which this object is distributed.
Definition: Tpetra_DistObject_decl.hpp:942
Tpetra::REPLACE
Replace existing values with new values.
Definition: Tpetra_CombineMode.hpp:97
Tpetra::Classes::@199::EWhichNormImpl
EWhichNormImpl
Input argument for localNormImpl() (which see).
Definition: Tpetra_MultiVector_def.hpp:2146
Tpetra_Details_isInterComm.hpp
Declaration of Tpetra::Details::isInterComm.
Tpetra_Details_Behavior.hpp
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Tpetra_Details_PackTraits.hpp
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Tpetra::Classes::MultiVector::view_
dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
Definition: Tpetra_MultiVector_decl.hpp:2214
Tpetra::Classes::DistObject< Scalar, LO, GO, Node >::device_type
Node::device_type device_type
The Kokkos Device type.
Definition: Tpetra_DistObject_decl.hpp:370
Tpetra::MultiVector
Classes::MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > MultiVector
Alias for Tpetra::Classes::MultiVector.
Definition: Tpetra_MultiVector_fwd.hpp:72
Tpetra::Details::ProfilingRegion
Profile the given scope.
Definition: Tpetra_Details_Profiling.hpp:100
Tpetra::Classes::MultiVector::dual_view_type
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, typename execution_space::execution_space > dual_view_type
Kokkos::DualView specialization used by this class.
Definition: Tpetra_MultiVector_decl.hpp:476
Tpetra::Details::getDualViewCopyFromArrayView
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
Definition: Tpetra_Util.hpp:912
Tpetra::Details::Behavior::debug
static bool debug()
Whether Tpetra is in debug mode.
Definition: Tpetra_Details_Behavior.cpp:245
Tpetra::Classes::DistObject< Scalar, LocalOrdinal, GlobalOrdinal, Node >::getMap
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Definition: Tpetra_DistObject_decl.hpp:510
Tpetra::Classes::MultiVector::getStride
size_t getStride() const
Stride between columns in the multivector.
Definition: Tpetra_MultiVector_def.hpp:772
Tpetra::ADD
Sum new values into existing values.
Definition: Tpetra_CombineMode.hpp:95
Tpetra::Classes::MultiVector< Scalar, LO, GO, Node >::EWhichNorm
EWhichNorm
Input argument for normImpl() (which see).
Definition: Tpetra_MultiVector_decl.hpp:2268
Tpetra::ABSMAX
Replace old value with maximum of magnitudes of old and new values.
Definition: Tpetra_CombineMode.hpp:98
Tpetra::Classes::DistObject< Scalar, LO, GO, Node >
Tpetra::Classes::MultiVector::MultiVector
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
Definition: Tpetra_MultiVector_def.hpp:313
Tpetra_Details_Profiling.hpp
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Tpetra::createMultiVector
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
Tpetra::Classes::MultiVector< Scalar, LO, GO, Node >::mag_type
Kokkos::Details::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
Definition: Tpetra_MultiVector_decl.hpp:443
Tpetra_Details_castAwayConstDualView.hpp
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Tpetra::Classes::DistObject< Scalar, LocalOrdinal, GlobalOrdinal, Node >::imports_
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Definition: Tpetra_DistObject_decl.hpp:951
Tpetra_Util.hpp
Stand-alone utility functions and macros.
Tpetra::Classes::MultiVector
One or more distributed dense vectors.
Definition: Tpetra_MultiVector_decl.hpp:389
Tpetra::Classes::MultiVector::dot_type
Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
Definition: Tpetra_MultiVector_decl.hpp:435
Tpetra::Classes::MultiVector::origView_
dual_view_type origView_
The "original view" of the MultiVector's data.
Definition: Tpetra_MultiVector_decl.hpp:2245
Tpetra_Details_reallocDualViewIfNeeded.hpp
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Tpetra::removeEmptyProcessesInPlace
void removeEmptyProcessesInPlace(Teuchos::RCP< DistObjectType > &input, const Teuchos::RCP< const Map< typename DistObjectType::local_ordinal_type, typename DistObjectType::global_ordinal_type, typename DistObjectType::node_type > > &newMap)
Remove processes which contain no elements in this object's Map.
Definition: Tpetra_DistObject_def.hpp:1643
Tpetra::Classes::MultiVector::getNumVectors
size_t getNumVectors() const
Number of columns in the multivector.
Definition: Tpetra_MultiVector_def.hpp:1739
Tpetra::createCopy
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
Definition: Tpetra_MultiVector_def.hpp:5086
Tpetra::Classes::Map
A parallel distribution of indices over processes.
Definition: Tpetra_Map_decl.hpp:247
Tpetra::Classes::MultiVector< Scalar, LO, GO, Node >::impl_scalar_type
Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
Definition: Tpetra_MultiVector_decl.hpp:414
Tpetra::Classes::DistObject< Scalar, LO, GO, Node >::execution_space
device_type::execution_space execution_space
The Kokkos execution space.
Definition: Tpetra_DistObject_decl.hpp:372
Tpetra::Classes::DistObject< Scalar, LocalOrdinal, GlobalOrdinal, Node >::numImportPacketsPerLID_
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Definition: Tpetra_DistObject_decl.hpp:983
Tpetra::Classes::Map::getNodeNumElements
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
Definition: Tpetra_Map_decl.hpp:578
Tpetra_Details_lclDot.hpp
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Tpetra::global_size_t
size_t global_size_t
Global size_t object.
Definition: Tpetra_ConfigDefs.hpp:109
Tpetra
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Tpetra::deep_copy
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Definition: Tpetra_MultiVector_decl.hpp:2453
Tpetra::Details::castAwayConstDualView
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
Definition: Tpetra_Details_castAwayConstDualView.hpp:64
Tpetra::SrcDistObject
Abstract base class for objects that can be the source of an Import or Export operation.
Definition: Tpetra_SrcDistObject.hpp:89
Tpetra::Classes::MultiVector::getOrigNumLocalRows
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
Definition: Tpetra_MultiVector_def.hpp:3332
Tpetra::Classes::MultiVector::getGlobalLength
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Definition: Tpetra_MultiVector_def.hpp:760
Tpetra::Classes::DistObject< Scalar, LocalOrdinal, GlobalOrdinal, Node >::numExportPacketsPerLID_
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
Definition: Tpetra_DistObject_decl.hpp:1005
Tpetra::Classes::DistObject< Scalar, LocalOrdinal, GlobalOrdinal, Node >::isDistributed
bool isDistributed() const
Whether this is a globally distributed object.
Definition: Tpetra_DistObject_def.hpp:420
Tpetra::Classes::MultiVector::assign
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
Definition: Tpetra_MultiVector_def.hpp:4778
Tpetra::INSERT
Insert new values that don't currently exist.
Definition: Tpetra_CombineMode.hpp:96
Tpetra_Details_fill.hpp
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
Tpetra::Details::reallocDualViewIfNeeded
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Definition: Tpetra_Details_reallocDualViewIfNeeded.hpp:83
Tpetra::CombineMode
CombineMode
Rule for combining data in an Import or Export.
Definition: Tpetra_CombineMode.hpp:94
Tpetra::Classes::Vector
A distributed dense vector.
Definition: Tpetra_Vector_decl.hpp:82