Tpetra parallel linear algebra  Version of the Day
Tpetra_Experimental_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
43 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
44 
47 
50 #include "Teuchos_TimeMonitor.hpp"
51 #ifdef HAVE_TPETRA_DEBUG
52 # include <set>
53 #endif // HAVE_TPETRA_DEBUG
54 
55 //
56 // mfh 25 May 2016: Temporary fix for #393.
57 //
58 // Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
59 // 4.7.2 compiler bug ("internal compiler error") when compiling them.
60 // Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
61 // use them in that case, either.
62 //
63 // mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
64 // compiler error") too. Ditto for GCC 5.1. I'll just disable the
65 // thing for any GCC version.
66 //
67 #if defined(__CUDACC__)
68  // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
69 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
70 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
71 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
72 
73 #elif defined(__GNUC__)
74 
75 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
76 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
77 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
78 
79 #else // some other compiler
80 
81  // Optimistically assume that other compilers aren't broken.
82 # if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
83 # define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
84 # endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
85 #endif // defined(__CUDACC__), defined(__GNUC__)
86 
87 
88 namespace Tpetra {
89 namespace Experimental {
90 namespace Classes {
91 
92 namespace Impl {
93 
94 #if 0
// NOTE(review): This whole class is compiled out via '#if 0'; the live
// replacement is BcrsApplyNoTrans1VecFunctor below.  It is a team-level
// (hierarchical-parallel) variant of the single-column block-CRS
// mat-vec: each league (team) takes rowsPerTeam_ block rows, each
// thread one block row, with per-thread scratch for the output block.
// NOTE(review): this file is a Doxygen extraction; several declaration
// lines appear to have been dropped (e.g. the identifier of the
// 'typedef ... local_ordinal_type;' below and the Doxygen comments),
// so the code is kept byte-for-byte rather than reconstructed.
95 template<class AlphaCoeffType,
96  class GraphType,
97  class MatrixValuesType,
98  class InVecType,
99  class BetaCoeffType,
100  class OutVecType>
101 class BcrsApplyNoTrans1VecTeamFunctor {
102 private:
// Compile-time contract: values are a const rank-1 View, X is a const
// rank-1 View, Y is a nonconst rank-1 View.
103  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
104  "MatrixValuesType must be a Kokkos::View.");
105  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
106  "OutVecType must be a Kokkos::View.");
107  static_assert (Kokkos::Impl::is_view<InVecType>::value,
108  "InVecType must be a Kokkos::View.");
109  static_assert (std::is_same<MatrixValuesType,
110  typename MatrixValuesType::const_type>::value,
111  "MatrixValuesType must be a const Kokkos::View.");
112  static_assert (std::is_same<OutVecType,
113  typename OutVecType::non_const_type>::value,
114  "OutVecType must be a nonconst Kokkos::View.");
115  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
116  "InVecType must be a const Kokkos::View.");
117  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
118  "MatrixValuesType must be a rank-1 Kokkos::View.");
119  static_assert (static_cast<int> (InVecType::rank) == 1,
120  "InVecType must be a rank-1 Kokkos::View.");
121  static_assert (static_cast<int> (OutVecType::rank) == 1,
122  "OutVecType must be a rank-1 Kokkos::View.");
123  typedef typename MatrixValuesType::non_const_value_type scalar_type;
124  typedef typename GraphType::device_type device_type;
125  typedef typename device_type::execution_space execution_space;
126  typedef typename execution_space::scratch_memory_space shmem_space;
127 
128 public:
// NOTE(review): the identifier of this typedef (presumably
// 'local_ordinal_type', by analogy with BcrsApplyNoTrans1VecFunctor)
// was lost in extraction -- confirm against the original source.
130  typedef typename std::remove_const<typename GraphType::data_type>::type
133  typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,
134  execution_space,
135  local_ordinal_type> policy_type;
// Swap in a new input column without rebuilding the functor.
142  void setX (const InVecType& X) { X_ = X; }
143 
// Swap in a new output column without rebuilding the functor.
150  void setY (const OutVecType& Y) { Y_ = Y; }
151 
// Bytes of team-level scratch requested: one block's worth of Y values.
155  KOKKOS_INLINE_FUNCTION local_ordinal_type
156  getScratchSizePerTeam () const
157  {
158  // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
159  // that have run-time sizes, like some of those in Stokhos.
160  typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
161  return blockSize_ * sizeof (y_value_type);
162  }
163 
// Bytes of per-thread scratch requested: one block's worth of Y values.
167  KOKKOS_INLINE_FUNCTION local_ordinal_type
168  getScratchSizePerThread () const
169  {
170  // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
171  // that have run-time sizes, like some of those in Stokhos.
172  typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
173  return blockSize_ * sizeof (y_value_type);
174  }
175 
176 private:
// Number of local block (mesh) rows = (#row offsets) - 1, or 0 if the
// row-offsets view is empty.
177  KOKKOS_INLINE_FUNCTION local_ordinal_type
178  getNumLclMeshRows () const
179  {
180  return ptr_.extent (0) == 0 ?
181  static_cast<local_ordinal_type> (0) :
182  static_cast<local_ordinal_type> (ptr_.extent (0) - 1);
183  }
184 
185  static constexpr local_ordinal_type defaultRowsPerTeam = 20;
186 
187 public:
// Hard-wired to one team; the intended formula is kept commented out.
189  local_ordinal_type getNumTeams () const {
190  return 1;
191  // const local_ordinal_type numLclMeshRows = getNumLclMeshRows ();
192  // return (numLclMeshRows + rowsPerTeam_ - 1) / rowsPerTeam_;
193  }
194 
196  BcrsApplyNoTrans1VecTeamFunctor (const typename std::decay<AlphaCoeffType>::type& alpha,
197  const GraphType& graph,
198  const MatrixValuesType& val,
199  const local_ordinal_type blockSize,
200  const InVecType& X,
201  const typename std::decay<BetaCoeffType>::type& beta,
202  const OutVecType& Y,
203  const local_ordinal_type rowsPerTeam = defaultRowsPerTeam) :
204  alpha_ (alpha),
205  ptr_ (graph.row_map),
206  ind_ (graph.entries),
207  val_ (val),
208  blockSize_ (blockSize),
209  X_ (X),
210  beta_ (beta),
211  Y_ (Y),
212  rowsPerTeam_ (rowsPerTeam)
213  {
214  // std::ostringstream os;
215  // os << "Functor ctor: numLclMeshRows = " << getNumLclMeshRows () << std::endl;
216  // std::cerr << os.str ();
217  }
218 
// Team entry point: compute Y(block rows of this team) =
// beta*Y + alpha*A*X for one column, using per-thread scratch.
219  KOKKOS_INLINE_FUNCTION void
220  operator () (const typename policy_type::member_type& member) const
221  {
226  using Kokkos::Details::ArithTraits;
227  // I'm not writing 'using Kokkos::make_pair;' here, because that
228  // may break builds for users who make the mistake of putting
229  // 'using namespace std;' in the global namespace. Please don't
230  // ever do that! But just in case you do, I'll take this
231  // precaution.
232  using Kokkos::parallel_for;
233  using Kokkos::subview;
234  typedef local_ordinal_type LO;
235  typedef typename decltype (ptr_)::non_const_value_type offset_type;
236  typedef Kokkos::View<typename OutVecType::non_const_value_type*,
237  shmem_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
238  shared_array_type;
239  typedef Kokkos::View<typename OutVecType::non_const_value_type*,
240  Kokkos::LayoutRight,
241  device_type,
242  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
243  out_little_vec_type;
244  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
245  Kokkos::LayoutRight,
246  device_type,
247  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
248  little_block_type;
249 
250  const LO leagueRank = member.league_rank();
251 
252  // This looks wrong! All the threads are writing into the same local storage.
253  // shared_array_type threadLocalMem =
254  // shared_array_type (member.thread_scratch (1), blockSize_);
255  shared_array_type threadLocalMem =
256  shared_array_type (member.thread_scratch (1), blockSize_ * rowsPerTeam_);
257 
258  // This looks wrong! All the threads are writing into the same local storage.
259  //out_little_vec_type Y_tlm (threadLocalMem.data (), blockSize_, 1);
260 
261  const LO numLclMeshRows = getNumLclMeshRows ();
262  const LO rowBeg = leagueRank * rowsPerTeam_;
263  const LO rowTmp = rowBeg + rowsPerTeam_;
264  const LO rowEnd = rowTmp < numLclMeshRows ? rowTmp : numLclMeshRows;
265 
266  // {
267  // std::ostringstream os;
268  // os << leagueRank << "," << member.team_rank () << ": "
269  // << rowBeg << "," << rowEnd << std::endl;
270  // std::cerr << os.str ();
271  // }
272 
273  // Each team takes rowsPerTeam_ (block) rows.
274  // Each thread in the team takes a single (block) row.
275  parallel_for (Kokkos::TeamThreadRange (member, rowBeg, rowEnd),
276  [&] (const LO& lclRow) {
277  // Each thread in the team gets its own temp storage.
278  out_little_vec_type Y_tlm (threadLocalMem.data () + blockSize_ * member.team_rank (), blockSize_);
279 
280  const offset_type Y_ptBeg = lclRow * blockSize_;
281  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
282  auto Y_cur =
283  subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
// Initialize the scratch copy of Y according to beta: 0, Y, or beta*Y.
284  if (beta_ == ArithTraits<BetaCoeffType>::zero ()) {
285  FILL (Y_tlm, ArithTraits<BetaCoeffType>::zero ());
286  }
287  else if (beta_ == ArithTraits<BetaCoeffType>::one ()) {
288  COPY (Y_cur, Y_tlm);
289  }
290  else { // beta != 0 && beta != 1
291  COPY (Y_cur, Y_tlm);
292  SCAL (beta_, Y_tlm);
293  }
294 
295  if (alpha_ != ArithTraits<AlphaCoeffType>::zero ()) {
296  const offset_type blkBeg = ptr_[lclRow];
297  const offset_type blkEnd = ptr_[lclRow+1];
298  // Precompute to save integer math in the inner loop.
299  const offset_type bs2 = blockSize_ * blockSize_;
300  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
301  ++absBlkOff) {
302  little_block_type A_cur (val_.data () + absBlkOff * bs2,
303  blockSize_, blockSize_);
304  const offset_type X_blkCol = ind_[absBlkOff];
305  const offset_type X_ptBeg = X_blkCol * blockSize_;
306  const offset_type X_ptEnd = X_ptBeg + blockSize_;
307  auto X_cur =
308  subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
309  // Y_tlm += alpha*A_cur*X_cur
310  GEMV (alpha_, A_cur, X_cur, Y_tlm);
311  } // for each entry in current local block row of matrix
312  COPY (Y_tlm, Y_cur);
313  }
314  });
315  }
316 
317 private:
318  typename std::decay<AlphaCoeffType>::type alpha_;
319  typename GraphType::row_map_type::const_type ptr_;
320  typename GraphType::entries_type::const_type ind_;
321  MatrixValuesType val_;
322  local_ordinal_type blockSize_;
323  InVecType X_;
324  typename std::decay<BetaCoeffType>::type beta_;
325  OutVecType Y_;
326  local_ordinal_type rowsPerTeam_;
327 };
328 #endif // 0
329 
// Functor for the no-transpose block-CRS sparse matrix-vector multiply,
// one column at a time: Y := beta*Y + alpha*A*X, where X and Y are
// rank-1 (single-column) views.  One thread handles one block row, so
// each thread writes a disjoint block of Y and no scratch is needed.
// NOTE(review): this file is a Doxygen extraction; a few declaration
// lines were dropped (e.g. the identifier of the 'typedef ...
// local_ordinal_type;' below, and the Doxygen comments), so the code
// is kept byte-for-byte rather than reconstructed.
330 template<class AlphaCoeffType,
331  class GraphType,
332  class MatrixValuesType,
333  class InVecType,
334  class BetaCoeffType,
335  class OutVecType>
336 class BcrsApplyNoTrans1VecFunctor {
337 private:
// Compile-time contract: values are a const rank-1 View, X is a const
// rank-1 View, Y is a nonconst rank-1 View.
338  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
339  "MatrixValuesType must be a Kokkos::View.");
340  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
341  "OutVecType must be a Kokkos::View.");
342  static_assert (Kokkos::Impl::is_view<InVecType>::value,
343  "InVecType must be a Kokkos::View.");
344  static_assert (std::is_same<MatrixValuesType,
345  typename MatrixValuesType::const_type>::value,
346  "MatrixValuesType must be a const Kokkos::View.");
347  static_assert (std::is_same<OutVecType,
348  typename OutVecType::non_const_type>::value,
349  "OutVecType must be a nonconst Kokkos::View.");
350  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
351  "InVecType must be a const Kokkos::View.");
352  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
353  "MatrixValuesType must be a rank-1 Kokkos::View.");
354  static_assert (static_cast<int> (InVecType::rank) == 1,
355  "InVecType must be a rank-1 Kokkos::View.");
356  static_assert (static_cast<int> (OutVecType::rank) == 1,
357  "OutVecType must be a rank-1 Kokkos::View.");
358  typedef typename MatrixValuesType::non_const_value_type scalar_type;
359 
360 public:
361  typedef typename GraphType::device_type device_type;
362 
// NOTE(review): the identifier of this typedef (presumably
// 'local_ordinal_type', used immediately below in policy_type) was
// lost in extraction -- confirm against the original source.
364  typedef typename std::remove_const<typename GraphType::data_type>::type
367  typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,
368  typename device_type::execution_space,
369  local_ordinal_type> policy_type;
// Swap in a new input column without rebuilding the functor.
376  void setX (const InVecType& X) { X_ = X; }
377 
// Swap in a new output column without rebuilding the functor.
384  void setY (const OutVecType& Y) { Y_ = Y; }
385 
386  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
387  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
388 
390  BcrsApplyNoTrans1VecFunctor (const alpha_coeff_type& alpha,
391  const GraphType& graph,
392  const MatrixValuesType& val,
393  const local_ordinal_type blockSize,
394  const InVecType& X,
395  const beta_coeff_type& beta,
396  const OutVecType& Y) :
397  alpha_ (alpha),
398  ptr_ (graph.row_map),
399  ind_ (graph.entries),
400  val_ (val),
401  blockSize_ (blockSize),
402  X_ (X),
403  beta_ (beta),
404  Y_ (Y)
405  {}
406 
// Per-row entry point: Y(block row lclRow) = beta*Y + alpha*A*X.
407  KOKKOS_INLINE_FUNCTION void
408  operator () (const local_ordinal_type& lclRow) const
409  {
414  using Kokkos::Details::ArithTraits;
415  // I'm not writing 'using Kokkos::make_pair;' here, because that
416  // may break builds for users who make the mistake of putting
417  // 'using namespace std;' in the global namespace. Please don't
418  // ever do that! But just in case you do, I'll take this
419  // precaution.
420  using Kokkos::parallel_for;
421  using Kokkos::subview;
422  typedef typename decltype (ptr_)::non_const_value_type offset_type;
423  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
424  Kokkos::LayoutRight,
425  device_type,
426  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
427  little_block_type;
428 
429  const offset_type Y_ptBeg = lclRow * blockSize_;
430  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
431  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
432 
433  // This version of the code does not use temporary storage.
434  // Each thread writes to its own block of the target vector.
435  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
436  FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
437  }
438  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
439  SCAL (beta_, Y_cur);
440  }
441 
442  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
443  const offset_type blkBeg = ptr_[lclRow];
444  const offset_type blkEnd = ptr_[lclRow+1];
445  // Precompute to save integer math in the inner loop.
446  const offset_type bs2 = blockSize_ * blockSize_;
447  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
448  ++absBlkOff) {
449  little_block_type A_cur (val_.data () + absBlkOff * bs2,
450  blockSize_, blockSize_);
451  const offset_type X_blkCol = ind_[absBlkOff];
452  const offset_type X_ptBeg = X_blkCol * blockSize_;
453  const offset_type X_ptEnd = X_ptBeg + blockSize_;
454  auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
455 
456  GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
457  } // for each entry in current local block row of matrix
458  }
459  }
460 
461 private:
462  alpha_coeff_type alpha_;
463  typename GraphType::row_map_type::const_type ptr_;
464  typename GraphType::entries_type::const_type ind_;
465  MatrixValuesType val_;
466  local_ordinal_type blockSize_;
467  InVecType X_;
468  beta_coeff_type beta_;
469  OutVecType Y_;
470 };
471 
472 template<class AlphaCoeffType,
473  class GraphType,
474  class MatrixValuesType,
475  class InMultiVecType,
476  class BetaCoeffType,
477  class OutMultiVecType>
478 void
479 bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
480  const GraphType& graph,
481  const MatrixValuesType& val,
482  const typename std::remove_const<typename GraphType::data_type>::type blockSize,
483  const InMultiVecType& X,
484  const BetaCoeffType& beta,
485  const OutMultiVecType& Y
486 #if 0
487  , const typename std::remove_const<typename GraphType::data_type>::type rowsPerTeam = 20
488 #endif // 0
489  )
490 {
491  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
492  "MatrixValuesType must be a Kokkos::View.");
493  static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
494  "OutMultiVecType must be a Kokkos::View.");
495  static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
496  "InMultiVecType must be a Kokkos::View.");
497  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
498  "MatrixValuesType must be a rank-1 Kokkos::View.");
499  static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
500  "OutMultiVecType must be a rank-2 Kokkos::View.");
501  static_assert (static_cast<int> (InMultiVecType::rank) == 2,
502  "InMultiVecType must be a rank-2 Kokkos::View.");
503 
504  typedef typename MatrixValuesType::const_type matrix_values_type;
505  typedef typename OutMultiVecType::non_const_type out_multivec_type;
506  typedef typename InMultiVecType::const_type in_multivec_type;
507  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
508  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
509  typedef typename std::remove_const<typename GraphType::data_type>::type LO;
510 
511  const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
512  static_cast<LO> (0) :
513  static_cast<LO> (graph.row_map.extent (0) - 1);
514  const LO numVecs = Y.extent (1);
515  if (numLocalMeshRows == 0 || numVecs == 0) {
516  return; // code below doesn't handle numVecs==0 correctly
517  }
518 
519  // These assignments avoid instantiating the functor extra times
520  // unnecessarily, e.g., for X const vs. nonconst. We only need the
521  // X const case, so only instantiate for that case.
522  in_multivec_type X_in = X;
523  out_multivec_type Y_out = Y;
524 
525  // The functor only knows how to handle one vector at a time, and it
526  // expects 1-D Views. Thus, we need to know the type of each column
527  // of X and Y.
528  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
529  typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
530 #if 0
531  typedef BcrsApplyNoTrans1VecTeamFunctor<alpha_type, GraphType,
532  matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
533 #else
534  typedef BcrsApplyNoTrans1VecFunctor<alpha_type, GraphType,
535  matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
536 #endif // 0
537  typedef typename functor_type::policy_type policy_type;
538 
539  auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
540  auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
541 #if 0
542  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0, rowsPerTeam);
543  const LO numTeams = functor.getNumTeams ();
544  policy_type policy (numTeams, Kokkos::AUTO ());
545  {
546  // KJ : hierarchy level of memory allocated e.g., cache (1),
547  // HBM (2), DDR (3), not used for now
548  const LO level = 1;
549  // KJ : for now provide two options for parallelizing (for vs. reduce)
550  const LO scratchSizePerTeam = functor.getScratchSizePerTeam (); // used for team parallel_red
551  const LO scratchSizePerThread = functor.getScratchSizePerThread (); // used for team parallel_for
552  policy =
553  policy.set_scratch_size (level,
554  Kokkos::PerTeam (scratchSizePerTeam),
555  Kokkos::PerThread (scratchSizePerThread));
556  }
557 #else
558  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
559  policy_type policy (0, numLocalMeshRows);
560 #endif // 0
561 
562  // Compute the first column of Y.
563  Kokkos::parallel_for (policy, functor);
564 
565  // Compute the remaining columns of Y.
566  for (LO j = 1; j < numVecs; ++j) {
567  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
568  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
569  functor.setX (X_j);
570  functor.setY (Y_j);
571  Kokkos::parallel_for (policy, functor);
572  }
573 }
574 
575 } // namespace Impl
576 
577 namespace { // (anonymous)
578 
579 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
580 // version that takes two Kokkos::View arguments).
581 template<class Scalar, class LO, class GO, class Node>
582 class GetLocalDiagCopy {
583 public:
584  typedef typename Node::device_type device_type;
585  typedef size_t diag_offset_type;
586  typedef Kokkos::View<const size_t*, device_type,
587  Kokkos::MemoryUnmanaged> diag_offsets_type;
588  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
589  typedef typename global_graph_type::local_graph_type local_graph_type;
590  typedef typename local_graph_type::row_map_type row_offsets_type;
591  typedef typename ::Tpetra::Experimental::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
592  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
593  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
594 
595  // Constructor
596  GetLocalDiagCopy (const diag_type& diag,
597  const values_type& val,
598  const diag_offsets_type& diagOffsets,
599  const row_offsets_type& ptr,
600  const LO blockSize) :
601  diag_ (diag),
602  diagOffsets_ (diagOffsets),
603  ptr_ (ptr),
604  blockSize_ (blockSize),
605  offsetPerBlock_ (blockSize_*blockSize_),
606  val_(val)
607  {}
608 
609  KOKKOS_INLINE_FUNCTION void
610  operator() (const LO& lclRowInd) const
611  {
612  using Kokkos::ALL;
613 
614  // Get row offset
615  const size_t absOffset = ptr_[lclRowInd];
616 
617  // Get offset relative to start of row
618  const size_t relOffset = diagOffsets_[lclRowInd];
619 
620  // Get the total offset
621  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
622 
623  // Get a view of the block. BCRS currently uses LayoutRight
624  // regardless of the device.
625  typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
626  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
627  const_little_block_type;
628  const_little_block_type D_in (val_.data () + pointOffset,
629  blockSize_, blockSize_);
630  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
631  COPY (D_in, D_out);
632  }
633 
634  private:
635  diag_type diag_;
636  diag_offsets_type diagOffsets_;
637  row_offsets_type ptr_;
638  LO blockSize_;
639  LO offsetPerBlock_;
640  values_type val_;
641  };
642 } // namespace (anonymous)
643 
644  template<class Scalar, class LO, class GO, class Node>
645  std::ostream&
646  BlockCrsMatrix<Scalar, LO, GO, Node>::
647  markLocalErrorAndGetStream ()
648  {
649  * (this->localError_) = true;
650  if ((*errs_).is_null ()) {
651  *errs_ = Teuchos::rcp (new std::ostringstream ());
652  }
653  return **errs_;
654  }
655 
656  template<class Scalar, class LO, class GO, class Node>
659  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
660  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
661  blockSize_ (static_cast<LO> (0)),
662  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
663  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
664  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
665  offsetPerBlock_ (0),
666  localError_ (new bool (false)),
667  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
668  {
669  }
670 
// Constructor taking a previously constructed (fill-complete) graph
// and a block size.  Derives the domain and range point maps from the
// graph's mesh maps, mirrors the graph's row offsets and column
// indices to host, and allocates (zero-initialized) block values.
// NOTE(review): the constructor's signature line (original lines
// 672-673, presumably 'BlockCrsMatrix<...>::BlockCrsMatrix (const
// crs_graph_type& graph, ...') was lost in this Doxygen extraction --
// confirm against the original source before editing.
671  template<class Scalar, class LO, class GO, class Node>
674  const LO blockSize) :
675  dist_object_type (graph.getMap ()),
676  graph_ (graph),
677  rowMeshMap_ (* (graph.getRowMap ())),
678  blockSize_ (blockSize),
679  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
680  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
681  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
682  offsetPerBlock_ (blockSize * blockSize),
683  localError_ (new bool (false)),
684  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
685  {
686  TEUCHOS_TEST_FOR_EXCEPTION(
687  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
688  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
689  "rows (isSorted() is false). This class assumes sorted rows.");
690 
691  graphRCP_ = Teuchos::rcpFromRef(graph_);
692 
693  // Trick to test whether LO is nonpositive, without a compiler
694  // warning in case LO is unsigned (which is generally a bad idea
695  // anyway). I don't promise that the trick works, but it
696  // generally does with gcc at least, in my experience.
697  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
698  TEUCHOS_TEST_FOR_EXCEPTION(
699  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
700  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
701  " <= 0. The block size must be positive.");
702 
// Expand the graph's (mesh) domain and range maps into point maps.
703  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
704  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
705 
// Host mirror of the graph's row offsets.
706  {
707  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
708  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
709 
710  row_map_type ptr_d = graph.getLocalGraph ().row_map;
711  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
712  Kokkos::deep_copy (ptr_h_nc, ptr_d);
713  ptrHost_ = ptr_h_nc;
714  }
// Host mirror of the graph's column indices.
715  {
716  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
717  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
718 
719  entries_type ind_d = graph.getLocalGraph ().entries;
720  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
721  Kokkos::deep_copy (ind_h_nc, ind_d);
722  indHost_ = ind_h_nc;
723  }
724 
// One blockSize^2 chunk of scalars per graph entry.
725  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
726  val_ = decltype (val_) ("val", numValEnt);
727  }
728 
// Constructor taking a graph, explicit domain and range *point* maps,
// and a block size.  Same as the two-argument constructor except that
// the point maps are supplied by the caller instead of derived from
// the graph's mesh maps.
// NOTE(review): the constructor's signature line (original lines
// 730-731) was lost in this Doxygen extraction -- confirm against the
// original source before editing.
729  template<class Scalar, class LO, class GO, class Node>
732  const map_type& domainPointMap,
733  const map_type& rangePointMap,
734  const LO blockSize) :
735  dist_object_type (graph.getMap ()),
736  graph_ (graph),
737  rowMeshMap_ (* (graph.getRowMap ())),
738  domainPointMap_ (domainPointMap),
739  rangePointMap_ (rangePointMap),
740  blockSize_ (blockSize),
741  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
742  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
743  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
744  offsetPerBlock_ (blockSize * blockSize),
745  localError_ (new bool (false)),
746  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
747  {
748  TEUCHOS_TEST_FOR_EXCEPTION(
749  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
750  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
751  "rows (isSorted() is false). This class assumes sorted rows.");
752 
753  graphRCP_ = Teuchos::rcpFromRef(graph_);
754 
755  // Trick to test whether LO is nonpositive, without a compiler
756  // warning in case LO is unsigned (which is generally a bad idea
757  // anyway). I don't promise that the trick works, but it
758  // generally does with gcc at least, in my experience.
759  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
760  TEUCHOS_TEST_FOR_EXCEPTION(
761  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
762  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
763  " <= 0. The block size must be positive.");
764 
// Host mirror of the graph's row offsets.
765  {
766  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
767  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
768 
769  row_map_type ptr_d = graph.getLocalGraph ().row_map;
770  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
771  Kokkos::deep_copy (ptr_h_nc, ptr_d);
772  ptrHost_ = ptr_h_nc;
773  }
// Host mirror of the graph's column indices.
774  {
775  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
776  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
777 
778  entries_type ind_d = graph.getLocalGraph ().entries;
779  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
780  Kokkos::deep_copy (ind_h_nc, ind_d);
781  indHost_ = ind_h_nc;
782  }
783 
// One blockSize^2 chunk of scalars per graph entry.
784  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
785  val_ = decltype (val_) ("val", numValEnt);
786  }
787 
// Return the domain *point* map wrapped in a new RCP.
// NOTE(review): the qualified name line (original lines 790-791,
// presumably 'BlockCrsMatrix<...>::getDomainMap () const') was lost in
// this Doxygen extraction.
788  template<class Scalar, class LO, class GO, class Node>
789  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
792  { // Copy constructor of map_type does a shallow copy.
793  // We're only returning by RCP for backwards compatibility.
794  return Teuchos::rcp (new map_type (domainPointMap_));
795  }
796 
// Return the range *point* map wrapped in a new RCP.
// NOTE(review): the class-qualification line (original line 799) was
// lost in this Doxygen extraction.
797  template<class Scalar, class LO, class GO, class Node>
798  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
800  getRangeMap () const
801  { // Copy constructor of map_type does a shallow copy.
802  // We're only returning by RCP for backwards compatibility.
803  return Teuchos::rcp (new map_type (rangePointMap_));
804  }
805 
// Return the graph's row (mesh) map; simple delegation to graph_.
// NOTE(review): the class-qualification line (original line 808) was
// lost in this Doxygen extraction.
806  template<class Scalar, class LO, class GO, class Node>
807  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
809  getRowMap () const
810  {
811  return graph_.getRowMap();
812  }
813 
// Return the graph's column (mesh) map; simple delegation to graph_.
// NOTE(review): the class-qualification line (original line 816) was
// lost in this Doxygen extraction.
814  template<class Scalar, class LO, class GO, class Node>
815  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
817  getColMap () const
818  {
819  return graph_.getColMap();
820  }
821 
// Delegates to graph_.getGlobalNumRows().
// NOTE(review): the return-type and name lines (original lines
// 823-825; presumably 'global_size_t ... ::getGlobalNumRows () const')
// were lost in this Doxygen extraction -- confirm against the
// original source.
822  template<class Scalar, class LO, class GO, class Node>
826  {
827  return graph_.getGlobalNumRows();
828  }
829 
// Delegates to graph_.getNodeNumRows(); returns size_t.
// NOTE(review): the qualified name lines (original lines 832-833;
// presumably '...::getNodeNumRows () const') were lost in this Doxygen
// extraction -- confirm against the original source.
830  template<class Scalar, class LO, class GO, class Node>
831  size_t
834  {
835  return graph_.getNodeNumRows();
836  }
837 
// Delegates to graph_.getNodeMaxNumRowEntries(); returns size_t.
// NOTE(review): the qualified name lines (original lines 840-841;
// presumably '...::getNodeMaxNumRowEntries () const') were lost in
// this Doxygen extraction -- confirm against the original source.
838  template<class Scalar, class LO, class GO, class Node>
839  size_t
842  {
843  return graph_.getNodeMaxNumRowEntries();
844  }
845 
// Point-based sparse matrix-vector multiply: Y = beta*Y + alpha*op(A)*X.
// Validates 'mode', views X and Y as BlockMultiVectors over the graph's
// domain/range point maps, then delegates to applyBlock. All exceptions
// from the lower layers are caught and rethrown with descriptive context.
846  template<class Scalar, class LO, class GO, class Node>
847  void
849  apply (const mv_type& X,
850  mv_type& Y,
851  Teuchos::ETransp mode,
852  Scalar alpha,
853  Scalar beta) const
854  {
856  TEUCHOS_TEST_FOR_EXCEPTION(
857  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
858  std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::apply: "
859  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
860  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
861 
// Wrap the point MultiVectors as block multivectors; the BMV constructor
// throws std::invalid_argument if X/Y are not compatible with the maps.
862  BMV X_view;
863  BMV Y_view;
864  const LO blockSize = getBlockSize ();
865  try {
866  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
867  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
868  }
869  catch (std::invalid_argument& e) {
870  TEUCHOS_TEST_FOR_EXCEPTION(
871  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
872  "apply: Either the input MultiVector X or the output MultiVector Y "
873  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
874  "graph. BlockMultiVector's constructor threw the following exception: "
875  << e.what ());
876  }
877 
878  try {
879  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
880  // either that or mark fields of this class as 'mutable'. The
881  // problem is that applyBlock wants to do lazy initialization of
882  // temporary block multivectors.
883  const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
884  } catch (std::invalid_argument& e) {
885  TEUCHOS_TEST_FOR_EXCEPTION(
886  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
887  "apply: The implementation method applyBlock complained about having "
888  "an invalid argument. It reported the following: " << e.what ());
889  } catch (std::logic_error& e) {
890  TEUCHOS_TEST_FOR_EXCEPTION(
891  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
892  "apply: The implementation method applyBlock complained about a "
893  "possible bug in its implementation. It reported the following: "
894  << e.what ());
895  } catch (std::exception& e) {
896  TEUCHOS_TEST_FOR_EXCEPTION(
897  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
898  "apply: The implementation method applyBlock threw an exception which "
899  "is neither std::invalid_argument nor std::logic_error, but is a "
900  "subclass of std::exception. It reported the following: "
901  << e.what ());
902  } catch (...) {
903  TEUCHOS_TEST_FOR_EXCEPTION(
904  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
905  "apply: The implementation method applyBlock threw an exception which "
906  "is not an instance of a subclass of std::exception. This probably "
907  "indicates a bug. Please report this to the Tpetra developers.");
908  }
909  }
910 
// Block-level apply: dispatches to the no-transpose or (conjugate-)
// transpose implementation. Requires X and Y to share a block size.
// (The X/Y parameter lines are missing from this extract; they are BMV
// arguments — TODO confirm against the full file.)
911  template<class Scalar, class LO, class GO, class Node>
912  void
916  Teuchos::ETransp mode,
917  const Scalar alpha,
918  const Scalar beta)
919  {
920  TEUCHOS_TEST_FOR_EXCEPTION(
921  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
922  "Tpetra::Experimental::BlockCrsMatrix::applyBlock: "
923  "X and Y have different block sizes. X.getBlockSize() = "
924  << X.getBlockSize () << " != Y.getBlockSize() = "
925  << Y.getBlockSize () << ".");
926 
927  if (mode == Teuchos::NO_TRANS) {
928  applyBlockNoTrans (X, Y, alpha, beta);
929  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
930  applyBlockTrans (X, Y, mode, alpha, beta);
931  } else {
932  TEUCHOS_TEST_FOR_EXCEPTION(
933  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
934  "applyBlock: Invalid 'mode' argument. Valid values are "
935  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
936  }
937  }
938 
// Set every stored entry of the matrix to alpha. Writes wherever the
// values were last modified (host or device), marking that side modified
// so the dual-view sync state stays consistent; if neither side is marked
// modified, the write happens on device.
939  template<class Scalar, class LO, class GO, class Node>
940  void
942  setAllToScalar (const Scalar& alpha)
943  {
944 #ifdef HAVE_TPETRA_DEBUG
945  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::setAllToScalar: ";
946 #endif // HAVE_TPETRA_DEBUG
947 
948  if (this->template need_sync<device_type> ()) {
949  // If we need to sync to device, then the data were last
950  // modified on host. In that case, we should again modify them
951  // on host.
952 #ifdef HAVE_TPETRA_DEBUG
953  TEUCHOS_TEST_FOR_EXCEPTION
954  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
955  prefix << "The matrix's values need sync on both device and host.");
956 #endif // HAVE_TPETRA_DEBUG
957  this->template modify<Kokkos::HostSpace> ();
958  Kokkos::deep_copy (this->template getValues<Kokkos::HostSpace> (), alpha);
959  }
960  else if (this->template need_sync<Kokkos::HostSpace> ()) {
961  // If we need to sync to host, then the data were last modified
962  // on device. In that case, we should again modify them on
963  // device.
964 #ifdef HAVE_TPETRA_DEBUG
965  TEUCHOS_TEST_FOR_EXCEPTION
966  (this->template need_sync<device_type> (), std::runtime_error,
967  prefix << "The matrix's values need sync on both host and device.");
968 #endif // HAVE_TPETRA_DEBUG
969  this->template modify<device_type> ();
970  Kokkos::deep_copy (this->template getValues<device_type> (), alpha);
971  }
972  else { // neither host nor device marked as modified, so modify on device
973  this->template modify<device_type> ();
974  Kokkos::deep_copy (this->template getValues<device_type> (), alpha);
975  }
976  }
977 
// Overwrite the blocks in local row localRowInd at the given local column
// indices with the caller's values. vals holds numColInds dense blocks,
// packed contiguously, offsetPerBlock() entries each. Returns the number
// of column indices that were actually found in the row; invalid row or
// unfound columns are skipped silently. Operates on host data only (debug
// builds verify host is sync'd).
978  template<class Scalar, class LO, class GO, class Node>
979  LO
981  replaceLocalValues (const LO localRowInd,
982  const LO colInds[],
983  const Scalar vals[],
984  const LO numColInds) const
985  {
986 #ifdef HAVE_TPETRA_DEBUG
987  const char prefix[] =
988  "Tpetra::Experimental::BlockCrsMatrix::replaceLocalValues: ";
989 #endif // HAVE_TPETRA_DEBUG
990 
991  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
992  // We modified no values, because the input local row index is
993  // invalid on the calling process. That may not be an error, if
994  // numColInds is zero anyway; it doesn't matter. This is the
995  // advantage of returning the number of valid indices.
996  return static_cast<LO> (0);
997  }
998  const impl_scalar_type* const vIn =
999  reinterpret_cast<const impl_scalar_type*> (vals);
1000  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1001  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1002  const LO perBlockSize = this->offsetPerBlock ();
1003  LO hint = 0; // Guess for the relative offset into the current row
1004  LO pointOffset = 0; // Current offset into input values
1005  LO validCount = 0; // number of valid column indices in colInds
1006 
1007 #ifdef HAVE_TPETRA_DEBUG
1008  TEUCHOS_TEST_FOR_EXCEPTION
1009  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1010  prefix << "The matrix's data were last modified on device, but have "
1011  "not been sync'd to host. Please sync to host (by calling "
1012  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1013  "method.");
1014 #endif // HAVE_TPETRA_DEBUG
1015 
1016  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1017  // version of the data always exists (no lazy allocation for host
1018  // data).
1020  auto vals_host_out =
1021  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1022  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1023 
// Linear pass over the requested columns; 'hint' remembers where the
// previous column was found so that sorted colInds search in O(1) each.
1024  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1025  const LO relBlockOffset =
1026  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1027  if (relBlockOffset != LINV) {
1028  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1029  // are stored contiguously, with no padding. "Contiguously"
1030  // means that all memory between the first and last entries
1031  // belongs to the block (no striding). "No padding" means
1032  // that getBlockSize() * getBlockSize() is exactly the number
1033  // of entries that the block uses. For another place where
1034  // this assumption is encoded, see sumIntoLocalValues.
1035 
1036  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1037  // little_block_type A_old =
1038  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1039  impl_scalar_type* const A_old =
1040  vals_host_out_raw + absBlockOffset * perBlockSize;
1041  // const_little_block_type A_new =
1042  // getConstLocalBlockFromInput (vIn, pointOffset);
1043  const impl_scalar_type* const A_new = vIn + pointOffset;
1044  // COPY (A_new, A_old);
1045  for (LO i = 0; i < perBlockSize; ++i) {
1046  A_old[i] = A_new[i];
1047  }
1048  hint = relBlockOffset + 1;
1049  ++validCount;
1050  }
1051  }
1052  return validCount;
1053  }
1054 
// Fill 'offsets' (a device View, one entry per local mesh row) with the
// relative offsets of the diagonal blocks; pure forward to the graph.
1055  template <class Scalar, class LO, class GO, class Node>
1056  void
1058  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1059  Kokkos::MemoryUnmanaged>& offsets) const
1060  {
1061  graph_.getLocalDiagOffsets (offsets);
1062  }
1063 
// Deprecated ArrayRCP overload of getLocalDiagOffsets. Resizes the output
// if needed, then either writes straight into the host pointer (when
// device memory IS HostSpace) or computes into a temporary device View
// and deep-copies back to host.
1064  template <class Scalar, class LO, class GO, class Node>
1065  void TPETRA_DEPRECATED
1067  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
1068  {
1069  // mfh 19 Mar 2016: We plan to deprecate the ArrayRCP version of
1070  // this method in CrsGraph too, so don't call it (otherwise build
1071  // warnings will show up and annoy users). Instead, copy results
1072  // in and out, if the memory space requires it.
1073 
1074  const size_t lclNumRows = graph_.getNodeNumRows ();
1075  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
1076  offsets.resize (lclNumRows);
1077  }
1078 
1079  // The input ArrayRCP must always be a host pointer. Thus, if
1080  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
1081  // to write to that allocation directly as a Kokkos::View.
1082  typedef typename device_type::memory_space memory_space;
1083  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
1084  // It is always syntactically correct to assign a raw host
1085  // pointer to a device View, so this code will compile correctly
1086  // even if this branch never runs.
1087  typedef Kokkos::View<size_t*, device_type,
1088  Kokkos::MemoryUnmanaged> output_type;
1089  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1090  graph_.getLocalDiagOffsets (offsetsOut);
1091  }
1092  else {
1093  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
1094  graph_.getLocalDiagOffsets (offsetsTmp);
1095  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
1096  Kokkos::MemoryUnmanaged> output_type;
1097  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1098  Kokkos::deep_copy (offsetsOut, offsetsTmp);
1099  }
1100  }
1101 
// One local block (Jacobi-scaled) Gauss-Seidel sweep:
//   X := (1-omega)*X + omega * D_inv * (B - offdiag(A)*X), row by row.
// D_inv holds the explicit inverses of the diagonal blocks (one rank-2
// slice per mesh row). 'direction' selects Forward, Backward, or
// Symmetric (the latter recurses once each way and returns).
// (The B/X parameter lines are missing from this extract — presumably
// BlockMultiVector arguments; TODO confirm against the full file.)
1102  template <class Scalar, class LO, class GO, class Node>
1103  void
1107  const Kokkos::View<impl_scalar_type***, device_type,
1108  Kokkos::MemoryUnmanaged>& D_inv,
1109  const Scalar& omega,
1110  const ESweepDirection direction) const
1111  {
1112  using Kokkos::ALL;
1113  const impl_scalar_type zero =
1114  Kokkos::Details::ArithTraits<impl_scalar_type>::zero ();
1115  const impl_scalar_type one =
1116  Kokkos::Details::ArithTraits<impl_scalar_type>::one ();
1117  const LO numLocalMeshRows =
1118  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1119  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1120 
1121  // If using (new) Kokkos, replace localMem with thread-local
1122  // memory. Note that for larger block sizes, this will affect the
1123  // two-level parallelization. Look to Stokhos for best practice
1124  // on making this fast for GPUs.
1125  const LO blockSize = getBlockSize ();
1126  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1127  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
1128  little_vec_type X_lcl (localMem.getRawPtr (), blockSize);
1129 
// Row iteration is 1-based here (actlRow = lclRow - 1) so that the
// Backward sweep can terminate at rowEnd == 0 even if LO were unsigned;
// see the FIXME below.
1130  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
1131  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
1132  if (direction == Forward) {
1133  rowBegin = 1;
1134  rowEnd = numLocalMeshRows+1;
1135  rowStride = 1;
1136  }
1137  else if (direction == Backward) {
1138  rowBegin = numLocalMeshRows;
1139  rowEnd = 0;
1140  rowStride = -1;
1141  }
1142  else if (direction == Symmetric) {
1143  this->localGaussSeidel (B, X, D_inv, omega, Forward);
1144  this->localGaussSeidel (B, X, D_inv, omega, Backward);
1145  return;
1146  }
1147 
1148  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
1149  const Scalar minus_omega = -omega;
1150 
// Single-vector case kept separate from the multi-vector loop purely to
// avoid the inner j-loop overhead; the per-row body is otherwise the same.
1151  if (numVecs == 1) {
1152  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1153  const LO actlRow = lclRow - 1;
1154 
1155  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
1156  COPY (B_cur, X_lcl);
1157  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1158 
1159  const size_t meshBeg = ptrHost_[actlRow];
1160  const size_t meshEnd = ptrHost_[actlRow+1];
1161  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1162  const LO meshCol = indHost_[absBlkOff];
1163  const_little_block_type A_cur =
1164  getConstLocalBlockFromAbsOffset (absBlkOff);
1165  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1166 
1167  // X_lcl += alpha*A_cur*X_cur
1168  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1169  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
1170  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1171  } // for each entry in the current local row of the matrix
1172 
1173  // NOTE (mfh 20 Jan 2016) The two input Views here are
1174  // unmanaged already, so we don't have to take unmanaged
1175  // subviews first.
1176  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1177  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
1178  FILL (X_update, zero);
1179  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1180  } // for each local row of the matrix
1181  }
1182  else {
1183  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1184  for (LO j = 0; j < numVecs; ++j) {
1185  LO actlRow = lclRow-1;
1186 
1187  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
1188  COPY (B_cur, X_lcl);
1189  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1190 
1191  const size_t meshBeg = ptrHost_[actlRow];
1192  const size_t meshEnd = ptrHost_[actlRow+1];
1193  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1194  const LO meshCol = indHost_[absBlkOff];
1195  const_little_block_type A_cur =
1196  getConstLocalBlockFromAbsOffset (absBlkOff);
1197  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1198 
1199  // X_lcl += alpha*A_cur*X_cur
1200  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1201  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1202  } // for each entry in the current local row of the matrx
1203 
1204  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1205  auto X_update = X.getLocalBlock (actlRow, j);
1206  FILL (X_update, zero);
1207  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1208  } // for each entry in the current local row of the matrix
1209  } // for each local row of the matrix
1210  }
1211  }
1212 
// Unconditionally throws std::logic_error: the point-based Gauss-Seidel
// interface inherited from RowMatrix does not fit block matrices (see
// FIXME below). Callers should use localGaussSeidel instead.
1213  template <class Scalar, class LO, class GO, class Node>
1214  void
1219  const Scalar& dampingFactor,
1220  const ESweepDirection direction,
1221  const int numSweeps,
1222  const bool zeroInitialGuess) const
1223  {
1224  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1225  // interface for block Gauss-Seidel.
1226  TEUCHOS_TEST_FOR_EXCEPTION(
1227  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1228  "gaussSeidelCopy: Not implemented.");
1229  }
1230 
// Unconditionally throws std::logic_error, for the same reason as
// gaussSeidelCopy: the point-based interface does not fit block matrices.
1231  template <class Scalar, class LO, class GO, class Node>
1232  void
1237  const Teuchos::ArrayView<LO>& rowIndices,
1238  const Scalar& dampingFactor,
1239  const ESweepDirection direction,
1240  const int numSweeps,
1241  const bool zeroInitialGuess) const
1242  {
1243  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1244  // interface for block Gauss-Seidel.
1245  TEUCHOS_TEST_FOR_EXCEPTION(
1246  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1247  "reorderedGaussSeidelCopy: Not implemented.");
1248  }
1249 
// Deprecated: copy the diagonal blocks into 'diag' (one block per local
// mesh row), given precomputed diagonal offsets. Rows whose offset is
// invalid (no diagonal entry) receive a zero block. The declarator line
// (presumably taking a BlockCrsMatrix& or Vector& 'diag') is missing from
// this extract — TODO confirm.
1250  template <class Scalar, class LO, class GO, class Node>
1251  void TPETRA_DEPRECATED
1254  const Teuchos::ArrayView<const size_t>& offsets) const
1255  {
1256  using Teuchos::ArrayRCP;
1257  using Teuchos::ArrayView;
1258  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
1259 
1260  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
1261  const LO* columnIndices;
1262  Scalar* vals;
1263  LO numColumns;
1264  Teuchos::Array<LO> cols(1);
1265 
1266  // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
1267  Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
1268  for (size_t i = 0; i < myNumRows; ++i) {
1269  cols[0] = i;
1270  if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1271  diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
1272  }
1273  else {
// offsets[i] is a block offset within row i; scale by the dense block
// size (blockSize_^2) to index into the row's packed point values.
1274  getLocalRowView (i, columnIndices, vals, numColumns);
1275  diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
1276  }
1277  }
1278  }
1279 
1280 
// Copy the diagonal blocks into the rank-3 device View 'diag'
// (local mesh row x blockSize x blockSize), given precomputed offsets.
// Validates the output extents and offsets length, then runs a
// GetLocalDiagCopy functor over all local mesh rows on device.
1281  template <class Scalar, class LO, class GO, class Node>
1282  void
1285  Kokkos::MemoryUnmanaged>& diag,
1286  const Kokkos::View<const size_t*, device_type,
1287  Kokkos::MemoryUnmanaged>& offsets) const
1288  {
1289  using Kokkos::parallel_for;
1290  typedef typename device_type::execution_space execution_space;
1291  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1292 
1293  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1294  const LO blockSize = this->getBlockSize ();
1295  TEUCHOS_TEST_FOR_EXCEPTION
1296  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1297  static_cast<LO> (diag.extent (1)) < blockSize ||
1298  static_cast<LO> (diag.extent (2)) < blockSize,
1299  std::invalid_argument, prefix <<
1300  "The input Kokkos::View is not big enough to hold all the data.");
1301  TEUCHOS_TEST_FOR_EXCEPTION
1302  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1303  prefix << "offsets.size() = " << offsets.size () << " < local number of "
1304  "diagonal blocks " << lclNumMeshRows << ".");
1305 
1306 #ifdef HAVE_TPETRA_DEBUG
1307  TEUCHOS_TEST_FOR_EXCEPTION
1308  (this->template need_sync<device_type> (), std::runtime_error,
1309  prefix << "The matrix's data were last modified on host, but have "
1310  "not been sync'd to device. Please sync to device (by calling "
1311  "sync<device_type>() on this matrix) before calling this method.");
1312 #endif // HAVE_TPETRA_DEBUG
1313 
1314  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1315  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1316 
1317  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
1318  // we reserve the right to do lazy allocation of device data. (We
1319  // don't plan to do lazy allocation for host data; the host
1320  // version of the data always exists.)
1322  auto vals_dev =
1323  const_cast<this_type*> (this)->template getValues<device_type> ();
1324 
1325  parallel_for (policy_type (0, lclNumMeshRows),
1326  functor_type (diag, vals_dev, offsets,
1327  graph_.getLocalGraph ().row_map, blockSize_));
1328  }
1329 
1330 
// Host-side variant: copy the diagonal blocks into 'diag' using a
// parallel_for over the View's HostMirror execution space (see the
// mfh note below about CUDA). Offsets come in as a Teuchos::ArrayView.
1331  template <class Scalar, class LO, class GO, class Node>
1332  void
1335  Kokkos::MemoryUnmanaged>& diag,
1336  const Teuchos::ArrayView<const size_t>& offsets) const
1337  {
1338  using Kokkos::ALL;
1339  using Kokkos::parallel_for;
1340  typedef typename Kokkos::View<impl_scalar_type***, device_type,
1341  Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
1342 
1343  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1344  const LO blockSize = this->getBlockSize ();
1345  TEUCHOS_TEST_FOR_EXCEPTION
1346  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1347  static_cast<LO> (diag.extent (1)) < blockSize ||
1348  static_cast<LO> (diag.extent (2)) < blockSize,
1349  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1350  "The input Kokkos::View is not big enough to hold all the data.");
1351  TEUCHOS_TEST_FOR_EXCEPTION
1352  (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
1353  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1354  "offsets.size() = " << offsets.size () << " < local number of diagonal "
1355  "blocks " << lclNumMeshRows << ".");
1356 
1357  // mfh 12 Dec 2015: Use the host execution space, since we haven't
1358  // quite made everything work with CUDA yet.
1359  typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
1360  parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
1361  auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
1362  auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
1363  COPY (D_in, D_out);
1364  });
1365  }
1366 
1367 
// For each given local column index found in row localRowInd, replace the
// stored block entrywise with max(|old|, |new|) via Impl::absMax. Same
// packed-block input convention and return semantics as
// replaceLocalValues: returns the count of column indices found.
1368  template<class Scalar, class LO, class GO, class Node>
1369  LO
1371  absMaxLocalValues (const LO localRowInd,
1372  const LO colInds[],
1373  const Scalar vals[],
1374  const LO numColInds) const
1375  {
1376  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1377  // We modified no values, because the input local row index is
1378  // invalid on the calling process. That may not be an error, if
1379  // numColInds is zero anyway; it doesn't matter. This is the
1380  // advantage of returning the number of valid indices.
1381  return static_cast<LO> (0);
1382  }
1383  const impl_scalar_type* const vIn =
1384  reinterpret_cast<const impl_scalar_type*> (vals);
1385  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1386  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1387  const LO perBlockSize = this->offsetPerBlock ();
1388  LO hint = 0; // Guess for the relative offset into the current row
1389  LO pointOffset = 0; // Current offset into input values
1390  LO validCount = 0; // number of valid column indices in colInds
1391 
1392  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1393  const LO relBlockOffset =
1394  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1395  if (relBlockOffset != LINV) {
1396  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1397  little_block_type A_old =
1398  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1399  const_little_block_type A_new =
1400  getConstLocalBlockFromInput (vIn, pointOffset);
1401 
1402  ::Tpetra::Experimental::Impl::absMax (A_old, A_new);
1403  hint = relBlockOffset + 1;
1404  ++validCount;
1405  }
1406  }
1407  return validCount;
1408  }
1409 
1410 
// Add the caller's dense blocks into the stored blocks of row localRowInd
// at the given local column indices (A_old += A_new, entrywise). Input
// packing, host-only access, hint-based search, and return value (count
// of columns found) mirror replaceLocalValues exactly.
1411  template<class Scalar, class LO, class GO, class Node>
1412  LO
1414  sumIntoLocalValues (const LO localRowInd,
1415  const LO colInds[],
1416  const Scalar vals[],
1417  const LO numColInds) const
1418  {
1419 #ifdef HAVE_TPETRA_DEBUG
1420  const char prefix[] =
1421  "Tpetra::Experimental::BlockCrsMatrix::sumIntoLocalValues: ";
1422 #endif // HAVE_TPETRA_DEBUG
1423 
1424  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1425  // We modified no values, because the input local row index is
1426  // invalid on the calling process. That may not be an error, if
1427  // numColInds is zero anyway; it doesn't matter. This is the
1428  // advantage of returning the number of valid indices.
1429  return static_cast<LO> (0);
1430  }
1431  //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1432  const impl_scalar_type* const vIn =
1433  reinterpret_cast<const impl_scalar_type*> (vals);
1434  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1435  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1436  const LO perBlockSize = this->offsetPerBlock ();
1437  LO hint = 0; // Guess for the relative offset into the current row
1438  LO pointOffset = 0; // Current offset into input values
1439  LO validCount = 0; // number of valid column indices in colInds
1440 
1441 #ifdef HAVE_TPETRA_DEBUG
1442  TEUCHOS_TEST_FOR_EXCEPTION
1443  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1444  prefix << "The matrix's data were last modified on device, but have not "
1445  "been sync'd to host. Please sync to host (by calling "
1446  "sync<Kokkos::HostSpace>() on this matrix) before calling this method.");
1447 #endif // HAVE_TPETRA_DEBUG
1448 
1449  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1450  // version of the data always exists (no lazy allocation for host
1451  // data).
1453  auto vals_host_out =
1454  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1455  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1456 
1457  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1458  const LO relBlockOffset =
1459  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1460  if (relBlockOffset != LINV) {
1461  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1462  // are stored contiguously, with no padding. "Contiguously"
1463  // means that all memory between the first and last entries
1464  // belongs to the block (no striding). "No padding" means
1465  // that getBlockSize() * getBlockSize() is exactly the number
1466  // of entries that the block uses. For another place where
1467  // this assumption is encoded, see replaceLocalValues.
1468 
1469  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1470  // little_block_type A_old =
1471  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1472  impl_scalar_type* const A_old =
1473  vals_host_out_raw + absBlockOffset * perBlockSize;
1474  // const_little_block_type A_new =
1475  // getConstLocalBlockFromInput (vIn, pointOffset);
1476  const impl_scalar_type* const A_new = vIn + pointOffset;
1477  // AXPY (ONE, A_new, A_old);
1478  for (LO i = 0; i < perBlockSize; ++i) {
1479  A_old[i] += A_new[i];
1480  }
1481  hint = relBlockOffset + 1;
1482  ++validCount;
1483  }
1484  }
1485  return validCount;
1486  }
1487 
// Zero-copy view of one local row: on success sets colInds to the row's
// local column indices, vals to the row's packed block values (host data),
// numInds to the number of blocks in the row, and returns 0. For an
// invalid local row, outputs are nulled/zeroed and the return value is
// Teuchos::OrdinalTraits<LO>::invalid().
1488  template<class Scalar, class LO, class GO, class Node>
1489  LO
1491  getLocalRowView (const LO localRowInd,
1492  const LO*& colInds,
1493  Scalar*& vals,
1494  LO& numInds) const
1495  {
1496 #ifdef HAVE_TPETRA_DEBUG
1497  const char prefix[] =
1498  "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: ";
1499 #endif // HAVE_TPETRA_DEBUG
1500 
1501  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1502  colInds = NULL;
1503  vals = NULL;
1504  numInds = 0;
1505  return Teuchos::OrdinalTraits<LO>::invalid ();
1506  }
1507  else {
1508  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
1509  colInds = indHost_.data () + absBlockOffsetStart;
1510 
1511 #ifdef HAVE_TPETRA_DEBUG
1512  TEUCHOS_TEST_FOR_EXCEPTION
1513  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1514  prefix << "The matrix's data were last modified on device, but have "
1515  "not been sync'd to host. Please sync to host (by calling "
1516  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1517  "method.");
1518 #endif // HAVE_TPETRA_DEBUG
1519 
1520  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1521  // version of the data always exists (no lazy allocation for host
1522  // data).
1524  auto vals_host_out =
1525  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1526  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1527  impl_scalar_type* const vOut = vals_host_out_raw +
1528  absBlockOffsetStart * offsetPerBlock ();
1529  vals = reinterpret_cast<Scalar*> (vOut);
1530 
// numInds counts mesh blocks, not scalar entries; each block holds
// offsetPerBlock() scalars starting at 'vals'.
1531  numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
1532  return 0; // indicates no error
1533  }
1534  }
1535 
// Copy one local row into caller-provided arrays: Indices receives the
// numInds local (mesh) column indices; Values receives the packed block
// entries (numInds * blockSize_^2 scalars). Throws std::runtime_error if
// either output array is too small. NumEntries is set to the block count.
// NOTE(review): the return code of getLocalRowView is ignored here; for an
// invalid LocalRow, numInds comes back 0 so the loops are no-ops.
1536  template<class Scalar, class LO, class GO, class Node>
1537  void
1539  getLocalRowCopy (LO LocalRow,
1540  const Teuchos::ArrayView<LO>& Indices,
1541  const Teuchos::ArrayView<Scalar>& Values,
1542  size_t &NumEntries) const
1543  {
1544  const LO *colInds;
1545  Scalar *vals;
1546  LO numInds;
1547  getLocalRowView(LocalRow,colInds,vals,numInds);
1548  if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
1549  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1550  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1551  << numInds << " row entries");
1552  }
1553  for (LO i=0; i<numInds; ++i) {
1554  Indices[i] = colInds[i];
1555  }
1556  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1557  Values[i] = vals[i];
1558  }
1559  NumEntries = numInds;
1560  }
1561 
// For each requested local column index in row localRowInd, write the
// relative block offset within the row into offsets[k]. Entries for
// columns not present in the row are left untouched. Returns the number
// of columns found (0 for an invalid local row).
1562  template<class Scalar, class LO, class GO, class Node>
1563  LO
1565  getLocalRowOffsets (const LO localRowInd,
1566  ptrdiff_t offsets[],
1567  const LO colInds[],
1568  const LO numColInds) const
1569  {
1570  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1571  // We got no offsets, because the input local row index is
1572  // invalid on the calling process. That may not be an error, if
1573  // numColInds is zero anyway; it doesn't matter. This is the
1574  // advantage of returning the number of valid indices.
1575  return static_cast<LO> (0);
1576  }
1577 
1578  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1579  LO hint = 0; // Guess for the relative offset into the current row
1580  LO validCount = 0; // number of valid column indices in colInds
1581 
1582  for (LO k = 0; k < numColInds; ++k) {
1583  const LO relBlockOffset =
1584  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1585  if (relBlockOffset != LINV) {
1586  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1587  hint = relBlockOffset + 1;
1588  ++validCount;
1589  }
1590  }
1591  return validCount;
1592  }
1593 
1594 
// Offset-based variant of replaceLocalValues: instead of searching for
// column indices, the caller supplies relative block offsets (e.g. from
// getLocalRowOffsets). Offsets equal to size_t-invalid are skipped.
// Returns the number of valid offsets applied.
1595  template<class Scalar, class LO, class GO, class Node>
1596  LO
1598  replaceLocalValuesByOffsets (const LO localRowInd,
1599  const ptrdiff_t offsets[],
1600  const Scalar vals[],
1601  const LO numOffsets) const
1602  {
1603  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1604  // We modified no values, because the input local row index is
1605  // invalid on the calling process. That may not be an error, if
1606  // numColInds is zero anyway; it doesn't matter. This is the
1607  // advantage of returning the number of valid indices.
1608  return static_cast<LO> (0);
1609  }
1610  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1611 
1612  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1613  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1614  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1615  size_t pointOffset = 0; // Current offset into input values
1616  LO validCount = 0; // number of valid offsets
1617 
1618  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1619  const size_t relBlockOffset = offsets[k];
1620  if (relBlockOffset != STINV) {
1621  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1622  little_block_type A_old =
1623  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1624  const_little_block_type A_new =
1625  getConstLocalBlockFromInput (vIn, pointOffset);
1626  COPY (A_new, A_old);
1627  ++validCount;
1628  }
1629  }
1630  return validCount;
1631  }
1632 
1633 
// Offset-based variant of absMaxLocalValues: for each valid precomputed
// block offset, merge the corresponding input block into the stored block
// with entrywise max(|old|, |new|) via Impl::absMax. Invalid offsets are
// skipped. Returns the number of valid offsets applied.
1634  template<class Scalar, class LO, class GO, class Node>
1635  LO
1637  absMaxLocalValuesByOffsets (const LO localRowInd,
1638  const ptrdiff_t offsets[],
1639  const Scalar vals[],
1640  const LO numOffsets) const
1641  {
1642  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1643  // We modified no values, because the input local row index is
1644  // invalid on the calling process. That may not be an error, if
1645  // numColInds is zero anyway; it doesn't matter. This is the
1646  // advantage of returning the number of valid indices.
1647  return static_cast<LO> (0);
1648  }
1649  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1650 
1651  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1652  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1653  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1654  size_t pointOffset = 0; // Current offset into input values
1655  LO validCount = 0; // number of valid offsets
1656 
1657  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1658  const size_t relBlockOffset = offsets[k];
1659  if (relBlockOffset != STINV) {
1660  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1661  little_block_type A_old =
1662  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1663  const_little_block_type A_new =
1664  getConstLocalBlockFromInput (vIn, pointOffset);
1665  ::Tpetra::Experimental::Impl::absMax (A_old, A_new);
1666  ++validCount;
1667  }
1668  }
1669  return validCount;
1670  }
1671 
1672 
  template<class Scalar, class LO, class GO, class Node>
  LO
  // NOTE (review): the "BlockCrsMatrix<Scalar, LO, GO, Node>::" qualifier
  // line appears to be missing from this listing (likely lost in
  // extraction) -- confirm against the repository source.
  //
  // For each valid relative block offset in offsets[0 .. numOffsets-1]
  // within local (mesh) row localRowInd, add (AXPY with coefficient 1)
  // the corresponding blockSize x blockSize block of vals into the
  // matrix's block.  Invalid offsets (OrdinalTraits<size_t>::invalid())
  // are skipped.  Returns the number of offsets actually processed;
  // returns 0 if localRowInd is not owned by the calling process.
  sumIntoLocalValuesByOffsets (const LO localRowInd,
                               const ptrdiff_t offsets[],
                               const Scalar vals[],
                               const LO numOffsets) const
  {
    if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
      // We modified no values, because the input local row index is
      // invalid on the calling process. That may not be an error, if
      // numColInds is zero anyway; it doesn't matter. This is the
      // advantage of returning the number of valid indices.
      return static_cast<LO> (0);
    }
    const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
    // Reinterpret the user's Scalar array as the matrix's internal
    // (impl_scalar_type) representation.
    const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);

    const size_t absRowBlockOffset = ptrHost_[localRowInd];
    // NOTE (review): casting through LO before assigning to size_t looks
    // like it should be static_cast<size_t>; harmless unless
    // offsetPerBlock() overflows LO -- confirm.
    const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
    const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
    size_t pointOffset = 0; // Current offset into input values
    LO validCount = 0; // number of valid offsets

    // Each iteration consumes one blockSize x blockSize block from vals,
    // whether or not the corresponding offset is valid.
    for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
      const size_t relBlockOffset = offsets[k];
      if (relBlockOffset != STINV) {
        const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
        little_block_type A_old =
          getNonConstLocalBlockFromAbsOffset (absBlockOffset);
        const_little_block_type A_new =
          getConstLocalBlockFromInput (vIn, pointOffset);
        //A_old.update (ONE, A_new);
        AXPY (ONE, A_new, A_old); // A_old += ONE * A_new
        ++validCount;
      }
    }
    return validCount;
  }
1712 
1713 
  template<class Scalar, class LO, class GO, class Node>
  size_t
  // NOTE (review): the "BlockCrsMatrix<Scalar, LO, GO, Node>::" qualifier
  // line appears to be missing from this listing (likely lost in
  // extraction) -- confirm against the repository source.
  //
  // Number of (block) entries in local (mesh) row localRowInd, as
  // reported by the underlying graph; 0 if the calling process does
  // not own that row.
  getNumEntriesInLocalRow (const LO localRowInd) const
  {
    const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
    if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
      // NOTE (review): the function returns size_t, yet both branches
      // cast through LO; this could truncate a large row count -- the
      // casts look like they should be static_cast<size_t> (or none).
      return static_cast<LO> (0); // the calling process doesn't have that row
    } else {
      return static_cast<LO> (numEntInGraph);
    }
  }
1726 
  template<class Scalar, class LO, class GO, class Node>
  void
  // NOTE (review): the qualifier and the leading parameter lines of this
  // definition (the block multivector arguments X and Y) appear to be
  // missing from this listing (likely lost in extraction) -- confirm
  // against the repository source.  Per the error message below, this
  // overload handles the (conjugate) transpose apply path, which is
  // unimplemented: it discards its arguments and always throws.
                 const Teuchos::ETransp mode,
                 const Scalar alpha,
                 const Scalar beta)
  {
    // Silence unused-parameter warnings; nothing is computed here.
    (void) X;
    (void) Y;
    (void) mode;
    (void) alpha;
    (void) beta;

    TEUCHOS_TEST_FOR_EXCEPTION(
      true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::apply: "
      "transpose and conjugate transpose modes are not yet implemented.");
  }
1746 
  template<class Scalar, class LO, class GO, class Node>
  void
  // NOTE (review): the qualifier and the leading parameter lines of this
  // definition (the block multivector arguments X and Y) appear to be
  // missing from this listing (likely lost in extraction) -- confirm
  // against the repository source.
  //
  // Non-transposed block apply: Y := alpha*A*X + beta*Y, handling the
  // distributed-memory communication (Import of X to the column Map,
  // Export of Y from the row Map) around the purely local kernel
  // localApplyBlockNoTrans.  Column-Map and row-Map temporaries are
  // lazily created and cached via X_colMap_ / Y_rowMap_.
                      const Scalar alpha,
                      const Scalar beta)
  {
    using Teuchos::RCP;
    using Teuchos::rcp;
    typedef ::Tpetra::Import<LO, GO, Node> import_type;
    typedef ::Tpetra::Export<LO, GO, Node> export_type;
    const Scalar zero = STS::zero ();
    const Scalar one = STS::one ();
    RCP<const import_type> import = graph_.getImporter ();
    // "export" is a reserved C++ keyword, so we can't use it.
    RCP<const export_type> theExport = graph_.getExporter ();
    const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";

    if (alpha == zero) {
      // alpha == 0 short-circuits the matrix-vector product entirely.
      if (beta == zero) {
        Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
      }
      else if (beta != one) {
        Y.scale (beta);
      }
    }
    else { // alpha != 0
      const BMV* X_colMap = NULL;
      if (import.is_null ()) {
        // No Import needed: X already has the column Map's distribution.
        try {
          X_colMap = &X;
        }
        catch (std::exception& e) {
          TEUCHOS_TEST_FOR_EXCEPTION
            (true, std::logic_error, prefix << "Tpetra::MultiVector::"
             "operator= threw an exception: " << std::endl << e.what ());
        }
      }
      else {
        // X_colMap_ is a pointer to a pointer to BMV. Ditto for
        // Y_rowMap_ below. This lets us do lazy initialization
        // correctly with view semantics of BlockCrsMatrix. All views
        // of this BlockCrsMatrix have the same outer pointer. That
        // way, we can set the inner pointer in one view, and all
        // other views will see it.
        if ((*X_colMap_).is_null () ||
            (**X_colMap_).getNumVectors () != X.getNumVectors () ||
            (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
          *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
                                     static_cast<LO> (X.getNumVectors ())));
        }
#ifdef HAVE_TPETRA_BCRS_DO_POINT_IMPORT
        if (pointImporter_->is_null ()) {
          // The Import ctor needs RCPs. Map's copy ctor does a shallow copy, so
          // these are small operations.
          const auto domainPointMap = rcp (new typename BMV::map_type (domainPointMap_));
          const auto colPointMap = rcp (new typename BMV::map_type (
                                          BMV::makePointMap (*graph_.getColMap(),
                                                             blockSize_)));
          *pointImporter_ = rcp (new typename crs_graph_type::import_type (
                                   domainPointMap, colPointMap));
        }
        // NOTE (review): the line completing this doImport call (its
        // combine-mode argument and closing parenthesis) appears to be
        // missing from this listing -- confirm against the repository
        // source.
        (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
                                                     **pointImporter_,
#else
        (**X_colMap_).doImport (X, *import, ::Tpetra::REPLACE);
#endif
        try {
          X_colMap = &(**X_colMap_);
        }
        catch (std::exception& e) {
          TEUCHOS_TEST_FOR_EXCEPTION
            (true, std::logic_error, prefix << "Tpetra::MultiVector::"
             "operator= threw an exception: " << std::endl << e.what ());
        }
      }

      BMV* Y_rowMap = NULL;
      if (theExport.is_null ()) {
        // No Export needed: write directly into Y.
        Y_rowMap = &Y;
      }
      else if ((*Y_rowMap_).is_null () ||
               (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
               (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
        *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
                                   static_cast<LO> (X.getNumVectors ())));
        try {
          Y_rowMap = &(**Y_rowMap_);
        }
        catch (std::exception& e) {
          TEUCHOS_TEST_FOR_EXCEPTION(
            true, std::logic_error, prefix << "Tpetra::MultiVector::"
            "operator= threw an exception: " << std::endl << e.what ());
        }
      }

      // Purely local part of the apply.
      try {
        localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
      }
      catch (std::exception& e) {
        TEUCHOS_TEST_FOR_EXCEPTION
          (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
           "an exception: " << e.what ());
      }
      catch (...) {
        TEUCHOS_TEST_FOR_EXCEPTION
          (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
           "an exception not a subclass of std::exception.");
      }

      // Redistribute the result from the row Map back into Y.
      if (! theExport.is_null ()) {
        Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
      }
    }
  }
1864 
1865  template<class Scalar, class LO, class GO, class Node>
1866  void
1867  BlockCrsMatrix<Scalar, LO, GO, Node>::
1868  localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1869  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1870  const Scalar alpha,
1871  const Scalar beta)
1872  {
1873  using ::Tpetra::Experimental::Classes::Impl::bcrsLocalApplyNoTrans;
1874 
1875  const impl_scalar_type alpha_impl = alpha;
1876  const auto graph = this->graph_.getLocalGraph ();
1877  const impl_scalar_type beta_impl = beta;
1878  const LO blockSize = this->getBlockSize ();
1879 
1880  auto X_mv = X.getMultiVectorView ();
1881  auto Y_mv = Y.getMultiVectorView ();
1882  Y_mv.template modify<device_type> ();
1883 
1884  auto X_lcl = X_mv.template getLocalView<device_type> ();
1885  auto Y_lcl = Y_mv.template getLocalView<device_type> ();
1886  auto val = this->val_.template view<device_type> ();
1887 
1888  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1889  beta_impl, Y_lcl);
1890  }
1891 
1892  template<class Scalar, class LO, class GO, class Node>
1893  LO
1894  BlockCrsMatrix<Scalar, LO, GO, Node>::
1895  findRelOffsetOfColumnIndex (const LO localRowIndex,
1896  const LO colIndexToFind,
1897  const LO hint) const
1898  {
1899  const size_t absStartOffset = ptrHost_[localRowIndex];
1900  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1901  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1902  // Amortize pointer arithmetic over the search loop.
1903  const LO* const curInd = indHost_.data () + absStartOffset;
1904 
1905  // If the hint was correct, then the hint is the offset to return.
1906  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1907  // Always return the offset relative to the current row.
1908  return hint;
1909  }
1910 
1911  // The hint was wrong, so we must search for the given column
1912  // index in the column indices for the given row.
1913  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1914 
1915  // We require that the graph have sorted rows. However, binary
1916  // search only pays if the current row is longer than a certain
1917  // amount. We set this to 32, but you might want to tune this.
1918  const LO maxNumEntriesForLinearSearch = 32;
1919  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1920  // Use binary search. It would probably be better for us to
1921  // roll this loop by hand. If we wrote it right, a smart
1922  // compiler could perhaps use conditional loads and avoid
1923  // branches (according to Jed Brown on May 2014).
1924  const LO* beg = curInd;
1925  const LO* end = curInd + numEntriesInRow;
1926  std::pair<const LO*, const LO*> p =
1927  std::equal_range (beg, end, colIndexToFind);
1928  if (p.first != p.second) {
1929  // offset is relative to the current row
1930  relOffset = static_cast<LO> (p.first - beg);
1931  }
1932  }
1933  else { // use linear search
1934  for (LO k = 0; k < numEntriesInRow; ++k) {
1935  if (colIndexToFind == curInd[k]) {
1936  relOffset = k; // offset is relative to the current row
1937  break;
1938  }
1939  }
1940  }
1941 
1942  return relOffset;
1943  }
1944 
  template<class Scalar, class LO, class GO, class Node>
  LO
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  offsetPerBlock () const
  {
    // Number of scalar entries per block (cached at construction).
    return offsetPerBlock_;
  }
1952 
  template<class Scalar, class LO, class GO, class Node>
  // NOTE (review): the return-type line (const_little_block_type, per the
  // return statement below) appears to be missing from this listing
  // (likely lost in extraction) -- confirm against the repository source.
  //
  // Wrap the blockSize_ x blockSize_ block starting at val + pointOffset
  // in a read-only little-block view.  No copy; val must outlive the view.
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  getConstLocalBlockFromInput (const impl_scalar_type* val,
                               const size_t pointOffset) const
  {
    // Row major blocks
    const LO rowStride = blockSize_;
    return const_little_block_type (val + pointOffset, blockSize_, rowStride);
  }
1963 
  template<class Scalar, class LO, class GO, class Node>
  // NOTE (review): the return-type line (little_block_type, per the
  // return statement below) appears to be missing from this listing
  // (likely lost in extraction) -- confirm against the repository source.
  //
  // Wrap the blockSize_ x blockSize_ block starting at val + pointOffset
  // in a writable little-block view.  No copy; val must outlive the view.
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  getNonConstLocalBlockFromInput (impl_scalar_type* val,
                                  const size_t pointOffset) const
  {
    // Row major blocks
    const LO rowStride = blockSize_;
    return little_block_type (val + pointOffset, blockSize_, rowStride);
  }
1974 
  template<class Scalar, class LO, class GO, class Node>
  // NOTE (review): the return-type line (const_little_block_type, per the
  // return statements below) appears to be missing from this listing
  // (likely lost in extraction) -- confirm against the repository source.
  //
  // Return a read-only view of the block at absolute block offset
  // absBlockOffset in the matrix's host value array, or an empty view
  // if the offset is out of range.
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
  {
#ifdef HAVE_TPETRA_DEBUG
    const char prefix[] =
      "Tpetra::Experimental::BlockCrsMatrix::getConstLocalBlockFromAbsOffset: ";
#endif // HAVE_TPETRA_DEBUG

    if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
      // An empty block signifies an error. We don't expect to see
      // this error in correct code, but it's helpful for avoiding
      // memory corruption in case there is a bug.
      return const_little_block_type ();
    }
    else {
#ifdef HAVE_TPETRA_DEBUG
      // In a debug build, insist that host data are current before we
      // hand out a host view of them.
      TEUCHOS_TEST_FOR_EXCEPTION
        (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
         prefix << "The matrix's data were last modified on device, but have "
         "not been sync'd to host. Please sync to host (by calling "
         "sync<Kokkos::HostSpace>() on this matrix) before calling this "
         "method.");
#endif // HAVE_TPETRA_DEBUG
      // Convert block offset to scalar ("point") offset.
      const size_t absPointOffset = absBlockOffset * offsetPerBlock ();

      // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
      // version of the data always exists (no lazy allocation for host
      // data).
      typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
      auto vals_host =
        const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
      const impl_scalar_type* vals_host_raw = vals_host.data ();

      return getConstLocalBlockFromInput (vals_host_raw, absPointOffset);
    }
  }
2013 
  template<class Scalar, class LO, class GO, class Node>
  // NOTE (review): the return-type line (const_little_block_type, per the
  // return statements below) appears to be missing from this listing
  // (likely lost in extraction) -- confirm against the repository source.
  //
  // Return a read-only view of the block at offset relMeshOffset within
  // local (mesh) row lclMeshRow, or an empty view if the row view could
  // not be obtained.
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
                                   const size_t relMeshOffset) const
  {
    typedef impl_scalar_type IST;

    const LO* lclColInds = NULL;
    Scalar* lclVals = NULL;
    LO numEnt = 0;

    // Nonzero err means the mesh row local index was invalid.
    LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
    if (err != 0) {
      // An empty block signifies an error. We don't expect to see
      // this error in correct code, but it's helpful for avoiding
      // memory corruption in case there is a bug.
      return const_little_block_type ();
    }
    else {
      // Convert block offset to scalar ("point") offset within the row.
      const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
      IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
      return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
                                                relPointOffset);
    }
  }
2040 
  template<class Scalar, class LO, class GO, class Node>
  // NOTE (review): the return-type line (little_block_type, per the
  // return statements below) appears to be missing from this listing
  // (likely lost in extraction) -- confirm against the repository source.
  //
  // Return a writable view of the block at absolute block offset
  // absBlockOffset in the matrix's host value array, or an empty view
  // if the offset is out of range.
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
  {
#ifdef HAVE_TPETRA_DEBUG
    const char prefix[] =
      "Tpetra::Experimental::BlockCrsMatrix::getNonConstLocalBlockFromAbsOffset: ";
#endif // HAVE_TPETRA_DEBUG

    if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
      // An empty block signifies an error. We don't expect to see
      // this error in correct code, but it's helpful for avoiding
      // memory corruption in case there is a bug.
      return little_block_type ();
    }
    else {
      // Convert block offset to scalar ("point") offset.
      const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
#ifdef HAVE_TPETRA_DEBUG
      // In a debug build, insist that host data are current before we
      // hand out a host view of them.
      TEUCHOS_TEST_FOR_EXCEPTION
        (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
         prefix << "The matrix's data were last modified on device, but have "
         "not been sync'd to host. Please sync to host (by calling "
         "sync<Kokkos::HostSpace>() on this matrix) before calling this "
         "method.");
#endif // HAVE_TPETRA_DEBUG
      // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
      // version of the data always exists (no lazy allocation for host
      // data).
      typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
      auto vals_host =
        const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
      impl_scalar_type* vals_host_raw = vals_host.data ();
      return getNonConstLocalBlockFromInput (vals_host_raw, absPointOffset);
    }
  }
2077 
  template<class Scalar, class LO, class GO, class Node>
  // NOTE (review): the return-type line (little_block_type, per the
  // return statements below) appears to be missing from this listing
  // (likely lost in extraction) -- confirm against the repository source.
  //
  // Return a writable view of the block at (localRowInd, localColInd),
  // or an empty view if that entry is not present in the graph.
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  getLocalBlock (const LO localRowInd, const LO localColInd) const
  {
    const size_t absRowBlockOffset = ptrHost_[localRowInd];
    // Search the row for the column index (no hint available here).
    const LO relBlockOffset =
      this->findRelOffsetOfColumnIndex (localRowInd, localColInd);

    if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
      const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
      return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
    }
    else {
      return little_block_type ();
    }
  }
2095 
2096  // template<class Scalar, class LO, class GO, class Node>
2097  // void
2098  // BlockCrsMatrix<Scalar, LO, GO, Node>::
2099  // clearLocalErrorStateAndStream ()
2100  // {
2101  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2102  // * (const_cast<this_type*> (this)->localError_) = false;
2103  // *errs_ = Teuchos::null;
2104  // }
2105 
2106  template<class Scalar, class LO, class GO, class Node>
2107  bool
2108  BlockCrsMatrix<Scalar, LO, GO, Node>::
2109  checkSizes (const ::Tpetra::SrcDistObject& source)
2110  {
2111  using std::endl;
2112  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2113  const this_type* src = dynamic_cast<const this_type* > (&source);
2114 
2115  if (src == NULL) {
2116  std::ostream& err = markLocalErrorAndGetStream ();
2117  err << "checkSizes: The source object of the Import or Export "
2118  "must be a BlockCrsMatrix with the same template parameters as the "
2119  "target object." << endl;
2120  }
2121  else {
2122  // Use a string of ifs, not if-elseifs, because we want to know
2123  // all the errors.
2124  if (src->getBlockSize () != this->getBlockSize ()) {
2125  std::ostream& err = markLocalErrorAndGetStream ();
2126  err << "checkSizes: The source and target objects of the Import or "
2127  << "Export must have the same block sizes. The source's block "
2128  << "size = " << src->getBlockSize () << " != the target's block "
2129  << "size = " << this->getBlockSize () << "." << endl;
2130  }
2131  if (! src->graph_.isFillComplete ()) {
2132  std::ostream& err = markLocalErrorAndGetStream ();
2133  err << "checkSizes: The source object of the Import or Export is "
2134  "not fill complete. Both source and target objects must be fill "
2135  "complete." << endl;
2136  }
2137  if (! this->graph_.isFillComplete ()) {
2138  std::ostream& err = markLocalErrorAndGetStream ();
2139  err << "checkSizes: The target object of the Import or Export is "
2140  "not fill complete. Both source and target objects must be fill "
2141  "complete." << endl;
2142  }
2143  if (src->graph_.getColMap ().is_null ()) {
2144  std::ostream& err = markLocalErrorAndGetStream ();
2145  err << "checkSizes: The source object of the Import or Export does "
2146  "not have a column Map. Both source and target objects must have "
2147  "column Maps." << endl;
2148  }
2149  if (this->graph_.getColMap ().is_null ()) {
2150  std::ostream& err = markLocalErrorAndGetStream ();
2151  err << "checkSizes: The target object of the Import or Export does "
2152  "not have a column Map. Both source and target objects must have "
2153  "column Maps." << endl;
2154  }
2155  }
2156 
2157  return ! (* (this->localError_));
2158  }
2159 
  template<class Scalar, class LO, class GO, class Node>
  void
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  // DistObject hook for Import/Export: copy the first numSameIDs local
  // rows from the source matrix, then copy rows permuteFromLIDs[k] of
  // the source into rows permuteToLIDs[k] of the target.  If the two
  // column Maps are locally identical, local column indices are copied
  // directly; otherwise each index is translated through global IDs.
  // Errors never throw from here: they are accumulated into the local
  // error state/stream (with extra per-index detail in debug builds).
  copyAndPermute (const ::Tpetra::SrcDistObject& source,
                  size_t numSameIDs,
                  const Teuchos::ArrayView<const LO>& permuteToLIDs,
                  const Teuchos::ArrayView<const LO>& permuteFromLIDs)
  {
    using std::endl;
    typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
    // Flip to true to get verbose per-process tracing on stderr.
    const bool debug = false;

    if (debug) {
      std::ostringstream os;
      const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
      os << "Proc " << myRank << ": copyAndPermute: "
         << "numSameIDs: " << numSameIDs
         << ", permuteToLIDs.size(): " << permuteToLIDs.size ()
         << ", permuteFromLIDs.size(): " << permuteFromLIDs.size ()
         << endl;
      std::cerr << os.str ();
    }

    // There's no communication in this method, so it's safe just to
    // return on error.

    if (* (this->localError_)) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << "copyAndPermute: The target object of the Import or Export is "
        "already in an error state." << endl;
      return;
    }
    if (permuteToLIDs.size () != permuteFromLIDs.size ()) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << "copyAndPermute: permuteToLIDs.size() = " << permuteToLIDs.size ()
          << " != permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."
          << endl;
      return;
    }

    const this_type* src = dynamic_cast<const this_type* > (&source);
    if (src == NULL) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << "copyAndPermute: The source object of the Import or Export is "
        "either not a BlockCrsMatrix, or does not have the right template "
        "parameters. checkSizes() should have caught this. "
        "Please report this bug to the Tpetra developers." << endl;
      return;
    }
    if (* (src->localError_)) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << "copyAndPermute: The source object of the Import or Export is "
        "already in an error state." << endl;
      return;
    }

    bool lclErr = false;
#ifdef HAVE_TPETRA_DEBUG
    // Debug builds remember exactly which rows/columns were invalid,
    // for the detailed error report at the end of this method.
    std::set<LO> invalidSrcCopyRows;
    std::set<LO> invalidDstCopyRows;
    std::set<LO> invalidDstCopyCols;
    std::set<LO> invalidDstPermuteCols;
    std::set<LO> invalidPermuteFromRows;
#endif // HAVE_TPETRA_DEBUG

    // Copy the initial sequence of rows that are the same.
    //
    // The two graphs might have different column Maps, so we need to
    // do this using global column indices. This is purely local, so
    // we only need to check for local sameness of the two column
    // Maps.

#ifdef HAVE_TPETRA_DEBUG
    const map_type& srcRowMap = * (src->graph_.getRowMap ());
#endif // HAVE_TPETRA_DEBUG
    const map_type& dstRowMap = * (this->graph_.getRowMap ());
    const map_type& srcColMap = * (src->graph_.getColMap ());
    const map_type& dstColMap = * (this->graph_.getColMap ());
    const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
    const size_t numPermute = static_cast<size_t> (permuteFromLIDs.size ());

    if (debug) {
      std::ostringstream os;
      const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
      os << "Proc " << myRank << ": copyAndPermute: "
         << "canUseLocalColumnIndices: "
         << (canUseLocalColumnIndices ? "true" : "false")
         << ", numPermute: " << numPermute
         << endl;
      std::cerr << os.str ();
    }

    if (canUseLocalColumnIndices) {
      // Fast path: the column Maps agree locally, so the source's
      // local column indices are valid in the target as well.
      // Copy local rows that are the "same" in both source and target.
      for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
#ifdef HAVE_TPETRA_DEBUG
        if (! srcRowMap.isNodeLocalElement (localRow)) {
          lclErr = true;
          invalidSrcCopyRows.insert (localRow);
          continue; // skip invalid rows
        }
#endif // HAVE_TPETRA_DEBUG

        const LO* lclSrcCols;
        Scalar* vals;
        LO numEntries;
        // If this call fails, that means the mesh row local index is
        // invalid. That means the Import or Export is invalid somehow.
        LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
        if (err != 0) {
          lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
          (void) invalidSrcCopyRows.insert (localRow);
#endif // HAVE_TPETRA_DEBUG
        }
        else {
          err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
          if (err != numEntries) {
            lclErr = true;
            if (! dstRowMap.isNodeLocalElement (localRow)) {
#ifdef HAVE_TPETRA_DEBUG
              invalidDstCopyRows.insert (localRow);
#endif // HAVE_TPETRA_DEBUG
            }
            else {
              // Once there's an error, there's no sense in saving
              // time, so we check whether the column indices were
              // invalid. However, only remember which ones were
              // invalid in a debug build, because that might take a
              // lot of space.
              for (LO k = 0; k < numEntries; ++k) {
                if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
                  lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
                  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
#endif // HAVE_TPETRA_DEBUG
                }
              }
            }
          }
        }
      } // for each "same" local row

      // Copy the "permute" local rows.
      for (size_t k = 0; k < numPermute; ++k) {
        const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
        const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);

        const LO* lclSrcCols;
        Scalar* vals;
        LO numEntries;
        LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
        if (err != 0) {
          lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
          invalidPermuteFromRows.insert (srcLclRow);
#endif // HAVE_TPETRA_DEBUG
        }
        else {
          err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
          if (err != numEntries) {
            lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
            for (LO c = 0; c < numEntries; ++c) {
              if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
                invalidDstPermuteCols.insert (lclSrcCols[c]);
              }
            }
#endif // HAVE_TPETRA_DEBUG
          }
        }
      }
    }
    else { // must convert column indices to global
      // Reserve space to store the destination matrix's local column indices.
      const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
      Teuchos::Array<LO> lclDstCols (maxNumEnt);

      // Copy local rows that are the "same" in both source and target.
      for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
        const LO* lclSrcCols;
        Scalar* vals;
        LO numEntries;
        // If this call fails, that means the mesh row local index is
        // invalid. That means the Import or Export is invalid somehow.
        LO err = 0;
        try {
          err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
        } catch (std::exception& e) {
          if (debug) {
            std::ostringstream os;
            const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
            os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
               << localRow << ", src->getLocalRowView() threw an exception: "
               << e.what ();
            std::cerr << os.str ();
          }
          throw e;
        }

        if (err != 0) {
          lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
          invalidSrcCopyRows.insert (localRow);
#endif // HAVE_TPETRA_DEBUG
        }
        else {
          if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
            lclErr = true;
            if (debug) {
              std::ostringstream os;
              const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
              os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
                 << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
                 << maxNumEnt << endl;
              std::cerr << os.str ();
            }
          }
          else {
            // Convert the source matrix's local column indices to the
            // destination matrix's local column indices.
            Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
            for (LO j = 0; j < numEntries; ++j) {
              lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
              if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
                lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
                invalidDstCopyCols.insert (lclDstColsView[j]);
#endif // HAVE_TPETRA_DEBUG
              }
            }
            try {
              err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
            } catch (std::exception& e) {
              if (debug) {
                std::ostringstream os;
                const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
                os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
                   << localRow << ", this->replaceLocalValues() threw an exception: "
                   << e.what ();
                std::cerr << os.str ();
              }
              throw e;
            }
            if (err != numEntries) {
              lclErr = true;
              if (debug) {
                std::ostringstream os;
                const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
                os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
                  "localRow " << localRow << ", this->replaceLocalValues "
                  "returned " << err << " instead of numEntries = "
                   << numEntries << endl;
                std::cerr << os.str ();
              }
            }
          }
        }
      }

      // Copy the "permute" local rows.
      for (size_t k = 0; k < numPermute; ++k) {
        const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
        const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);

        const LO* lclSrcCols;
        Scalar* vals;
        LO numEntries;
        LO err = 0;
        try {
          err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
        } catch (std::exception& e) {
          if (debug) {
            std::ostringstream os;
            const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
            os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
              "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
               << ", src->getLocalRowView() threw an exception: " << e.what ();
            std::cerr << os.str ();
          }
          throw e;
        }

        if (err != 0) {
          lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
          invalidPermuteFromRows.insert (srcLclRow);
#endif // HAVE_TPETRA_DEBUG
        }
        else {
          if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
            lclErr = true;
          }
          else {
            // Convert the source matrix's local column indices to the
            // destination matrix's local column indices.
            Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
            for (LO j = 0; j < numEntries; ++j) {
              lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
              if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
                lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
                invalidDstPermuteCols.insert (lclDstColsView[j]);
#endif // HAVE_TPETRA_DEBUG
              }
            }
            err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
            if (err != numEntries) {
              lclErr = true;
            }
          }
        }
      }
    }

    // Report accumulated errors.  In a debug build, print the exact
    // offending row/column indices collected above.
    if (lclErr) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
#ifdef HAVE_TPETRA_DEBUG
      err << "copyAndPermute: The graph structure of the source of the "
        "Import or Export must be a subset of the graph structure of the "
        "target. ";
      err << "invalidSrcCopyRows = [";
      for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
           it != invalidSrcCopyRows.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidSrcCopyRows.end ()) {
          err << ",";
        }
      }
      err << "], invalidDstCopyRows = [";
      for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
           it != invalidDstCopyRows.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidDstCopyRows.end ()) {
          err << ",";
        }
      }
      err << "], invalidDstCopyCols = [";
      for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
           it != invalidDstCopyCols.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidDstCopyCols.end ()) {
          err << ",";
        }
      }
      err << "], invalidDstPermuteCols = [";
      for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
           it != invalidDstPermuteCols.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidDstPermuteCols.end ()) {
          err << ",";
        }
      }
      err << "], invalidPermuteFromRows = [";
      for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
           it != invalidPermuteFromRows.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidPermuteFromRows.end ()) {
          err << ",";
        }
      }
      err << "]" << std::endl;
#else
      err << "copyAndPermute: The graph structure of the source of the "
        "Import or Export must be a subset of the graph structure of the "
        "target." << std::endl;
#endif // HAVE_TPETRA_DEBUG
    }

    if (debug) {
      std::ostringstream os;
      const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
      const bool lclSuccess = ! (* (this->localError_));
      os << "*** Proc " << myRank << ": copyAndPermute "
         << (lclSuccess ? "succeeded" : "FAILED");
      if (lclSuccess) {
        os << endl;
      } else {
        os << ": error messages: " << this->errorMessages (); // comes w/ endl
      }
      std::cerr << os.str ();
    }
  }
2553 
2554  namespace { // (anonymous)
2555 
2574  template<class LO, class GO, class D>
2575  size_t
2576  packRowCount (const size_t numEnt,
2577  const size_t numBytesPerValue,
2578  const size_t blkSize)
2579  {
2580  using ::Tpetra::Details::PackTraits;
2581 
2582  if (numEnt == 0) {
2583  // Empty rows always take zero bytes, to ensure sparsity.
2584  return 0;
2585  }
2586  else {
2587  // We store the number of entries as a local index (LO).
2588  LO numEntLO = 0; // packValueCount wants this.
2589  GO gid;
2590  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2591  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2592  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2593  return numEntLen + gidsLen + valsLen;
2594  }
2595  }
2596 
2607  template<class ST, class LO, class GO, class D>
2608  size_t
2609  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
2610  const size_t offset,
2611  const size_t numBytes,
2612  const size_t numBytesPerValue)
2613  {
2614  using Kokkos::subview;
2615  using ::Tpetra::Details::PackTraits;
2616 
2617  if (numBytes == 0) {
2618  // Empty rows always take zero bytes, to ensure sparsity.
2619  return static_cast<size_t> (0);
2620  }
2621  else {
2622  LO numEntLO = 0;
2623 #ifdef HAVE_TPETRA_DEBUG
2624  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
2625  TEUCHOS_TEST_FOR_EXCEPTION(
2626  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2627  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2628  << ".");
2629 #endif // HAVE_TPETRA_DEBUG
2630  const char* const inBuf = imports.data () + offset;
2631  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
2632 #ifdef HAVE_TPETRA_DEBUG
2633  TEUCHOS_TEST_FOR_EXCEPTION(
2634  actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2635  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2636  << ".");
2637 #else
2638  (void) actualNumBytes;
2639 #endif // HAVE_TPETRA_DEBUG
2640  return static_cast<size_t> (numEntLO);
2641  }
2642  }
2643 
2651  template<class ST, class LO, class GO, class D>
2652  size_t
2653  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
2654  const size_t offset,
2655  const size_t numEnt,
2656  const typename ::Tpetra::Details::PackTraits<GO, D>::input_array_type& gidsIn,
2657  const typename ::Tpetra::Details::PackTraits<ST, D>::input_array_type& valsIn,
2658  const size_t numBytesPerValue,
2659  const size_t blockSize)
2660  {
2661  using Kokkos::subview;
2662  using ::Tpetra::Details::PackTraits;
2663 
2664  if (numEnt == 0) {
2665  // Empty rows always take zero bytes, to ensure sparsity.
2666  return 0;
2667  }
2668  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2669 
2670  const GO gid = 0; // packValueCount wants this
2671  const LO numEntLO = static_cast<size_t> (numEnt);
2672 
2673  const size_t numEntBeg = offset;
2674  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2675  const size_t gidsBeg = numEntBeg + numEntLen;
2676  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2677  const size_t valsBeg = gidsBeg + gidsLen;
2678  const size_t valsLen = numScalarEnt * numBytesPerValue;
2679 
2680  char* const numEntOut = exports.data () + numEntBeg;
2681  char* const gidsOut = exports.data () + gidsBeg;
2682  char* const valsOut = exports.data () + valsBeg;
2683 
2684  size_t numBytesOut = 0;
2685  int errorCode = 0;
2686  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
2687 
2688  {
2689  Kokkos::pair<int, size_t> p;
2690  p = PackTraits<GO, D>::packArray (gidsOut, gidsIn.data (), numEnt);
2691  errorCode += p.first;
2692  numBytesOut += p.second;
2693 
2694  p = PackTraits<ST, D>::packArray (valsOut, valsIn.data (), numScalarEnt);
2695  errorCode += p.first;
2696  numBytesOut += p.second;
2697  }
2698 
2699  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2700  TEUCHOS_TEST_FOR_EXCEPTION(
2701  numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
2702  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2703  << expectedNumBytes << ".");
2704 
2705  TEUCHOS_TEST_FOR_EXCEPTION(
2706  errorCode != 0, std::runtime_error, "packRow: "
2707  "PackTraits::packArray returned a nonzero error code");
2708 
2709  return numBytesOut;
2710  }
2711 
2712  // Return the number of bytes actually read / used.
2713  template<class ST, class LO, class GO, class D>
2714  size_t
2715  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
2716  const typename ::Tpetra::Details::PackTraits<ST, D>::output_array_type& valsOut,
2717  const typename ::Tpetra::Details::PackTraits<int, D>::input_buffer_type& imports,
2718  const size_t offset,
2719  const size_t numBytes,
2720  const size_t numEnt,
2721  const size_t numBytesPerValue,
2722  const size_t blockSize)
2723  {
2724  using ::Tpetra::Details::PackTraits;
2725 
2726  if (numBytes == 0) {
2727  // Rows with zero bytes always have zero entries.
2728  return 0;
2729  }
2730  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2731  TEUCHOS_TEST_FOR_EXCEPTION(
2732  static_cast<size_t> (imports.extent (0)) <= offset,
2733  std::logic_error, "unpackRow: imports.extent(0) = "
2734  << imports.extent (0) << " <= offset = " << offset << ".");
2735  TEUCHOS_TEST_FOR_EXCEPTION(
2736  static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2737  std::logic_error, "unpackRow: imports.extent(0) = "
2738  << imports.extent (0) << " < offset + numBytes = "
2739  << (offset + numBytes) << ".");
2740 
2741  const GO gid = 0; // packValueCount wants this
2742  const LO lid = 0; // packValueCount wants this
2743 
2744  const size_t numEntBeg = offset;
2745  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
2746  const size_t gidsBeg = numEntBeg + numEntLen;
2747  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2748  const size_t valsBeg = gidsBeg + gidsLen;
2749  const size_t valsLen = numScalarEnt * numBytesPerValue;
2750 
2751  const char* const numEntIn = imports.data () + numEntBeg;
2752  const char* const gidsIn = imports.data () + gidsBeg;
2753  const char* const valsIn = imports.data () + valsBeg;
2754 
2755  size_t numBytesOut = 0;
2756  int errorCode = 0;
2757  LO numEntOut;
2758  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
2759  TEUCHOS_TEST_FOR_EXCEPTION(
2760  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2761  "unpackRow: Expected number of entries " << numEnt
2762  << " != actual number of entries " << numEntOut << ".");
2763 
2764  {
2765  Kokkos::pair<int, size_t> p;
2766  p = PackTraits<GO, D>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2767  errorCode += p.first;
2768  numBytesOut += p.second;
2769 
2770  p = PackTraits<ST, D>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2771  errorCode += p.first;
2772  numBytesOut += p.second;
2773  }
2774 
2775  TEUCHOS_TEST_FOR_EXCEPTION(
2776  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
2777  << numBytesOut << " != numBytes = " << numBytes << ".");
2778 
2779  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2780  TEUCHOS_TEST_FOR_EXCEPTION(
2781  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
2782  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2783  << expectedNumBytes << ".");
2784 
2785  TEUCHOS_TEST_FOR_EXCEPTION(
2786  errorCode != 0, std::runtime_error, "unpackRow: "
2787  "PackTraits::unpackArray returned a nonzero error code");
2788 
2789  return numBytesOut;
2790  }
2791  } // namespace (anonymous)
2792 
2793  template<class Scalar, class LO, class GO, class Node>
2794  void
2795  BlockCrsMatrix<Scalar, LO, GO, Node>::
2796  packAndPrepare (const ::Tpetra::SrcDistObject& source,
2797  const Teuchos::ArrayView<const LO>& exportLIDs,
2798  Teuchos::Array<packet_type>& exports,
2799  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
2800  size_t& constantNumPackets,
2801  ::Tpetra::Distributor& /* distor */)
2802  {
2803  using std::endl;
2804  using ::Tpetra::Details::PackTraits;
2805  using Kokkos::MemoryUnmanaged;
2806  using Kokkos::subview;
2807  using Kokkos::View;
2808  typedef typename ::Tpetra::MultiVector<Scalar, LO, GO, Node>::impl_scalar_type ST;
2809  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
2810  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2811  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
2812  const bool debug = false;
2813 
2814  if (debug) {
2815  std::ostringstream os;
2816  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2817  os << "Proc " << myRank << ": packAndPrepare: exportLIDs.size() = "
2818  << exportLIDs.size () << ", numPacketsPerLID.size() = "
2819  << numPacketsPerLID.size () << endl;
2820  std::cerr << os.str ();
2821  }
2822 
2823  if (* (this->localError_)) {
2824  std::ostream& err = this->markLocalErrorAndGetStream ();
2825  err << "packAndPrepare: The target object of the Import or Export is "
2826  "already in an error state." << endl;
2827  return;
2828  }
2829 
2830  const this_type* src = dynamic_cast<const this_type* > (&source);
2831  // Should have checked for these cases in checkSizes().
2832  if (src == NULL) {
2833  std::ostream& err = this->markLocalErrorAndGetStream ();
2834  err << "packAndPrepare: The source (input) object of the Import or "
2835  "Export is either not a BlockCrsMatrix, or does not have the right "
2836  "template parameters. checkSizes() should have caught this. "
2837  "Please report this bug to the Tpetra developers." << endl;
2838  return;
2839  }
2840 
2841  const crs_graph_type& srcGraph = src->graph_;
2842  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2843  const size_type numExportLIDs = exportLIDs.size ();
2844 
2845  if (numExportLIDs != numPacketsPerLID.size ()) {
2846  std::ostream& err = this->markLocalErrorAndGetStream ();
2847  err << "packAndPrepare: exportLIDs.size() = " << numExportLIDs
2848  << " != numPacketsPerLID.size() = " << numPacketsPerLID.size ()
2849  << "." << endl;
2850  return;
2851  }
2852 
2853  // Graphs and matrices are allowed to have a variable number of
2854  // entries per row. We could test whether all rows have the same
2855  // number of entries, but DistObject can only use this
2856  // optimization if all rows on _all_ processes have the same
2857  // number of entries. Rather than do the all-reduce necessary to
2858  // test for this unlikely case, we tell DistObject (by setting
2859  // constantNumPackets to zero) to assume that different rows may
2860  // have different numbers of entries.
2861  constantNumPackets = 0;
2862 
2863  // Compute the number of bytes ("packets") per row to pack. While
2864  // we're at it, compute the total # of block entries to send, and
2865  // the max # of block entries in any of the rows we're sending.
2866  size_t totalNumBytes = 0;
2867  size_t totalNumEntries = 0;
2868  size_t maxRowLength = 0;
2869  for (size_type i = 0; i < numExportLIDs; ++i) {
2870  const LO lclRow = exportLIDs[i];
2871  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2872  // If any given LIDs are invalid, the above might return either
2873  // zero or the invalid size_t value. If the former, we have no
2874  // way to tell, but that's OK; it just means the calling process
2875  // won't pack anything (it has nothing to pack anyway). If the
2876  // latter, we replace it with zero (that row is not owned by the
2877  // calling process, so it has no entries to pack).
2878  if (numEnt == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2879  numEnt = 0;
2880  }
2881  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2882 
2883  // The 'if' branch implicitly assumes that packRowCount() returns
2884  // zero if numEnt == 0.
2885  size_t numBytesPerValue = 0;
2886  if (numEnt > 0) {
2887  // Get a locally indexed view of the current row's data. If
2888  // the current row has > 0 entries, we need an entry in order
2889  // to figure out the byte count of the packed row. (We really
2890  // only need it if ST's size is determined at run time.)
2891  Scalar* valsRaw = NULL;
2892  const LO* lidsRaw = NULL;
2893  LO actualNumEnt = 0;
2894  const LO errCode =
2895  src->getLocalRowView (lclRow, lidsRaw, valsRaw, actualNumEnt);
2896 
2897  if (numEnt != static_cast<size_t> (actualNumEnt)) {
2898  std::ostream& err = this->markLocalErrorAndGetStream ();
2899  err << "packAndPrepare: Local row " << i << " claims to have " <<
2900  numEnt << "entry/ies, but the View returned by getLocalRowView() "
2901  "has " << actualNumEnt << " entry/ies. This should never happen. "
2902  "Please report this bug to the Tpetra developers." << endl;
2903  return;
2904  }
2905  if (errCode == Teuchos::OrdinalTraits<LO>::invalid ()) {
2906  std::ostream& err = this->markLocalErrorAndGetStream ();
2907  err << "packAndPrepare: Local row " << i << " is not in the row Map "
2908  "of the source object on the calling process." << endl;
2909  return;
2910  }
2911 
2912  const ST* valsRawST =
2913  const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2914  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2915 
2916  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2917  // space here for now, this doesn't assume UVM. That may change
2918  // in the future, if we ever start packing on the device.
2919  numBytesPerValue = PackTraits<ST, HES>::packValueCount (vals(0));
2920  }
2921 
2922  const size_t numBytes =
2923  packRowCount<LO, GO, HES> (numEnt, numBytesPerValue, blockSize);
2924  numPacketsPerLID[i] = numBytes;
2925  totalNumBytes += numBytes;
2926  totalNumEntries += numEnt;
2927  maxRowLength = std::max (maxRowLength, numEnt);
2928  }
2929 
2930  if (debug) {
2931  const int myRank = graph_.getComm ()->getRank ();
2932  std::ostringstream os;
2933  os << "Proc " << myRank << ": packAndPrepare: totalNumBytes = "
2934  << totalNumBytes << endl;
2935  std::cerr << os.str ();
2936  }
2937 
2938  // We use a "struct of arrays" approach to packing each row's
2939  // entries. All the column indices (as global indices) go first,
2940  // then all their owning process ranks, and then the values.
2941  exports.resize (totalNumBytes);
2942  if (totalNumEntries > 0) {
2943  View<char*, HES, MemoryUnmanaged> exportsK (exports.getRawPtr (),
2944  totalNumBytes);
2945 
2946  // Current position (in bytes) in the 'exports' output array.
2947  size_t offset = 0;
2948 
2949  // For each block row of the matrix owned by the calling
2950  // process, pack that block row's column indices and values into
2951  // the exports array.
2952 
2953  // Source matrix's column Map. We verified in checkSizes() that
2954  // the column Map exists (is not null).
2955  const map_type& srcColMap = * (srcGraph.getColMap ());
2956 
2957  // Temporary buffer for global column indices.
2958  View<GO*, HES> gblColInds;
2959  {
2960  GO gid = 0;
2961  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowLength, "gids");
2962  }
2963 
2964  // Pack the data for each row to send, into the 'exports' buffer.
2965  for (size_type i = 0; i < numExportLIDs; ++i) {
2966  const LO lclRowInd = exportLIDs[i];
2967  const LO* lclColIndsRaw;
2968  Scalar* valsRaw;
2969  LO numEntLO;
2970  // It's OK to ignore the return value, since if the calling
2971  // process doesn't own that local row, then the number of
2972  // entries in that row on the calling process is zero.
2973  (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
2974  const size_t numEnt = static_cast<size_t> (numEntLO);
2975  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2976  View<const LO*, HES, MemoryUnmanaged> lclColInds (lclColIndsRaw, numEnt);
2977  const ST* valsRawST = const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2978  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2979 
2980  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2981  // space here for now, this doesn't assume UVM. That may
2982  // change in the future, if we ever start packing on device.
2983  const size_t numBytesPerValue = numEnt == 0 ?
2984  static_cast<size_t> (0) :
2985  PackTraits<ST, HES>::packValueCount (vals(0));
2986 
2987  // Convert column indices from local to global.
2988  for (size_t j = 0; j < numEnt; ++j) {
2989  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2990  }
2991 
2992  // Copy the row's data into the current spot in the exports array.
2993  const size_t numBytes =
2994  packRowForBlockCrs<ST, LO, GO, HES> (exportsK, offset, numEnt, gblColInds,
2995  vals, numBytesPerValue, blockSize);
2996  // Keep track of how many bytes we packed.
2997  offset += numBytes;
2998  } // for each LID (of a row) to send
2999 
3000  if (offset != totalNumBytes) {
3001  std::ostream& err = this->markLocalErrorAndGetStream ();
3002  err << "packAndPreapre: At end of method, the final offset (in bytes) "
3003  << offset << " does not equal the total number of bytes packed "
3004  << totalNumBytes << ". "
3005  << "Please report this bug to the Tpetra developers." << endl;
3006  return;
3007  }
3008  } // if totalNumEntries > 0
3009 
3010  if (debug) {
3011  std::ostringstream os;
3012  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3013  const bool lclSuccess = ! (* (this->localError_));
3014  os << "*** Proc " << myRank << ": packAndPrepare "
3015  << (lclSuccess ? "succeeded" : "FAILED")
3016  << " (totalNumEntries = " << totalNumEntries << ") ***" << endl;
3017  std::cerr << os.str ();
3018  }
3019  }
3020 
3021 
  // Unpack rows received during an Import or Export (packed on the
  // sending process by packAndPrepare) and combine them into this
  // matrix according to the CombineMode CM.  Each nonempty packed row
  // holds an entry count (LO), global column indices, and block
  // values.  This method does not communicate, so returning early on
  // error cannot cause deadlock.
  template<class Scalar, class LO, class GO, class Node>
  void
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  unpackAndCombine (const Teuchos::ArrayView<const LO>& importLIDs,
                    const Teuchos::ArrayView<const packet_type>& imports,
                    const Teuchos::ArrayView<size_t>& numPacketsPerLID,
                    size_t /* constantNumPackets */, // not worthwhile to use this
                    ::Tpetra::Distributor& /* distor */,
                    ::Tpetra::CombineMode CM)
  {
    using std::endl;
    using ::Tpetra::Details::PackTraits;
    using Kokkos::MemoryUnmanaged;
    using Kokkos::subview;
    using Kokkos::View;
    typedef typename ::Tpetra::MultiVector<Scalar, LO, GO, Node>::impl_scalar_type ST;
    typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
    typedef typename View<int*, device_type>::HostMirror::execution_space HES;
    typedef std::pair<typename View<int*, HES>::size_type,
                      typename View<int*, HES>::size_type> pair_type;
    typedef View<GO*, HES, MemoryUnmanaged> gids_out_type;
    typedef View<LO*, HES, MemoryUnmanaged> lids_out_type;
    typedef View<ST*, HES, MemoryUnmanaged> vals_out_type;
    typedef typename PackTraits<GO, HES>::input_buffer_type input_buffer_type;
    const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::unpackAndCombine: ";
    const bool debug = false;

    if (debug) {
      std::ostringstream os;
      const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
      os << "Proc " << myRank << ": unpackAndCombine" << endl;
      std::cerr << os.str ();
    }

    // It should not cause deadlock to return on error in this method,
    // since this method does not communicate.

    if (* (this->localError_)) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix << "The target object of the Import or Export is "
        "already in an error state." << endl;
      return;
    }
    if (importLIDs.size () != numPacketsPerLID.size ()) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix << "importLIDs.size() = " << importLIDs.size () << " != "
        "numPacketsPerLID.size() = " << numPacketsPerLID.size () << "." << endl;
      return;
    }
    if (CM != ADD && CM != INSERT && CM != REPLACE && CM != ABSMAX && CM != ZERO) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix << "Invalid CombineMode value " << CM << ".  Valid "
          << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
          << endl;
      return;
    }

    // Target matrix's column Map.  Use to convert the global column
    // indices in the receive buffer to local indices.  We verified in
    // checkSizes() that the column Map exists (is not null).
    const map_type& tgtColMap = * (this->graph_.getColMap ());

    const size_type numImportLIDs = importLIDs.size ();
    if (CM == ZERO || numImportLIDs == 0) {
      if (debug) {
        std::ostringstream os;
        const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
        os << "Proc " << myRank << ": unpackAndCombine: Nothing to do" << endl;
        std::cerr << os.str ();
      }
      return; // nothing to do; no need to combine entries
    }

    if (debug) {
      std::ostringstream os;
      const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
      os << "Proc " << myRank << ": unpackAndCombine: Getting ready" << endl;
      std::cerr << os.str ();
    }

    input_buffer_type importsK (imports.getRawPtr (), imports.size ());
    const size_t blockSize = this->getBlockSize ();
    // Upper bound on entries per row; used to size the scratch Views
    // below once, instead of per row.
    const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
    const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;

    // Determine the number of bytes in a Scalar instance.
    size_t numBytesPerValue;

    // mfh 19 Sep 2017: Only Stokhos has Scalar types with run-time
    // size.  For all of those types, any one allocated instance has
    // the same size as any other allocated instance.  Thus, it
    // suffices to find some allocated instance as a representative
    // value.
    if (this->val_.h_view.extent (0) != 0) {
      const ST& val = this->val_.h_view[0];
      numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
    }
    else {
      // FIXME (mfh 19 Sep 2017): I don't have any values on my
      // process, so I don't know how big the value should be, if it's
      // run-time-sized.  The best I can do is use a default-allocated
      // Scalar instance's size.  If we ever want to fix this, then
      // each sending process should pack the run-time size.  This is
      // only necessary if the size is not a compile-time constant.
      Scalar val;
      numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
    }

    // Temporary space to cache incoming global column indices and
    // values.  Column indices come in as global indices, in case the
    // source object's column Map differs from the target object's
    // (this's) column Map.
    View<GO*, HES> gblColInds;
    View<LO*, HES> lclColInds;
    View<ST*, HES> vals;
    {
      GO gid = 0;
      LO lid = 0;
      // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
      // run-time size?  We already assume that all entries in both
      // the source and target matrices have the same size.  If the
      // calling process owns at least one entry in either matrix, we
      // can use that entry to set the size.  However, it is possible
      // that the calling process owns no entries.  In that case,
      // we're in trouble.  One way to fix this would be for each
      // row's data to contain the run-time size.  This is only
      // necessary if the size is not a compile-time constant.
      Scalar val;
      gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowNumEnt, "gids");
      lclColInds = PackTraits<LO, HES>::allocateArray (lid, maxRowNumEnt, "lids");
      vals = PackTraits<ST, HES>::allocateArray (val, maxRowNumScalarEnt, "vals");
    }

    // Walk the receive buffer row by row.  numPacketsPerLID[i] is the
    // byte count of row i's packed data; offset is the running byte
    // position within importsK.
    size_t offset = 0;
    bool errorDuringUnpack = false;
    for (size_type i = 0; i < numImportLIDs; ++i) {
      const size_t numBytes = numPacketsPerLID[i];
      if (numBytes == 0) {
        continue; // empty buffer for that row means that the row is empty
      }
      const size_t numEnt =
        unpackRowCount<ST, LO, GO, HES> (importsK, offset, numBytes,
                                         numBytesPerValue);
      if (numEnt > maxRowNumEnt) {
        errorDuringUnpack = true;
#ifdef HAVE_TPETRA_DEBUG
        std::ostream& err = this->markLocalErrorAndGetStream ();
        err << prefix << "At i = " << i << ", numEnt = " << numEnt
            << " > maxRowNumEnt = " << maxRowNumEnt << endl;
#endif // HAVE_TPETRA_DEBUG
        continue;
      }

      const size_t numScalarEnt = numEnt * blockSize * blockSize;
      const LO lclRow = importLIDs[i];

      // Views of just this row's portion of the scratch buffers.
      gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
      vals_out_type valsOut = subview (vals, pair_type (0, numScalarEnt));

      const size_t numBytesOut =
        unpackRowForBlockCrs<ST, LO, GO, HES> (gidsOut, valsOut, importsK,
                                               offset, numBytes, numEnt,
                                               numBytesPerValue, blockSize);
      if (numBytes != numBytesOut) {
        errorDuringUnpack = true;
#ifdef HAVE_TPETRA_DEBUG
        std::ostream& err = this->markLocalErrorAndGetStream ();
        err << prefix << "At i = " << i << ", numBytes = " << numBytes
            << " != numBytesOut = " << numBytesOut << ".";
#endif // HAVE_TPETRA_DEBUG
        continue;
      }

      // Convert incoming global indices to local indices.
      lids_out_type lidsOut = subview (lclColInds, pair_type (0, numEnt));
      for (size_t k = 0; k < numEnt; ++k) {
        lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
        if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
          errorDuringUnpack = true;
#ifdef HAVE_TPETRA_DEBUG
          std::ostream& err = this->markLocalErrorAndGetStream ();
          err << prefix << "At i = " << i << ", GID " << gidsOut(k)
              << " is not owned by the calling process.";
#endif // HAVE_TPETRA_DEBUG
          // NOTE(review): this 'continue' only skips the current
          // column entry k, not the whole row; the combine below
          // still runs with lidsOut(k) == invalid.  The combine then
          // modifies fewer than numEnt entries, which the numCombd
          // check below flags.  Presumably intentional best-effort
          // behavior -- confirm before changing.
          continue;
        }
      }

      // Combine the incoming data with the matrix's current data.
      LO numCombd = 0;
      const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
      const Scalar* const valsRaw =
        reinterpret_cast<const Scalar*> (const_cast<const ST*> (valsOut.data ()));
      if (CM == ADD) {
        numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
      } else if (CM == INSERT || CM == REPLACE) {
        numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
      } else if (CM == ABSMAX) {
        numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
      }

      // The combine should have touched exactly numEnt entries;
      // anything else means some entries were invalid or missing.
      if (static_cast<LO> (numEnt) != numCombd) {
        errorDuringUnpack = true;
#ifdef HAVE_TPETRA_DEBUG
        std::ostream& err = this->markLocalErrorAndGetStream ();
        err << prefix << "At i = " << i << ", numEnt = " << numEnt
            << " != numCombd = " << numCombd << ".";
#endif // HAVE_TPETRA_DEBUG
        // NOTE(review): since offset is not advanced on this path,
        // all later rows in this loop read from a stale byte offset.
        continue;
      }

      // Don't update offset until current LID has succeeded.
      offset += numBytes;
    } // for each import LID i

    if (errorDuringUnpack) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix << "Unpacking failed.";
#ifndef HAVE_TPETRA_DEBUG
      err << "  Please run again with a debug build to get more verbose "
        "diagnostic output.";
#endif // ! HAVE_TPETRA_DEBUG
      err << endl;
    }

    if (debug) {
      std::ostringstream os;
      const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
      const bool lclSuccess = ! (* (this->localError_));
      os << "*** Proc " << myRank << ": unpackAndCombine "
         << (lclSuccess ? "succeeded" : "FAILED")
         << " ***" << endl;
      std::cerr << os.str ();
    }
  }
3257 
3258 
3259  template<class Scalar, class LO, class GO, class Node>
3260  std::string
3262  {
3263  using Teuchos::TypeNameTraits;
3264  std::ostringstream os;
3265  os << "\"Tpetra::BlockCrsMatrix\": { "
3266  << "Template parameters: { "
3267  << "Scalar: " << TypeNameTraits<Scalar>::name ()
3268  << "LO: " << TypeNameTraits<LO>::name ()
3269  << "GO: " << TypeNameTraits<GO>::name ()
3270  << "Node: " << TypeNameTraits<Node>::name ()
3271  << " }"
3272  << ", Label: \"" << this->getObjectLabel () << "\""
3273  << ", Global dimensions: ["
3274  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3275  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3276  << ", Block size: " << getBlockSize ()
3277  << " }";
3278  return os.str ();
3279  }
3280 
3281 
// Print a description of this matrix to the given output stream, at the
// given verbosity level:
//   VERB_NONE    - print nothing;
//   VERB_LOW+    - print the label, global dimensions, and block size;
//   VERB_MEDIUM+ - also describe the row / column / domain / range Maps;
//   VERB_EXTREME - also print every process' entries, gathered to and
//                  printed on Process 0 (this syncs the matrix to host).
3282  template<class Scalar, class LO, class GO, class Node>
3283  void
3285  describe (Teuchos::FancyOStream& out,
3286  const Teuchos::EVerbosityLevel verbLevel) const
3287  {
3288  using Teuchos::ArrayRCP;
3289  using Teuchos::CommRequest;
3290  using Teuchos::FancyOStream;
3291  using Teuchos::getFancyOStream;
3292  using Teuchos::ireceive;
3293  using Teuchos::isend;
3294  using Teuchos::outArg;
3295  using Teuchos::TypeNameTraits;
3296  using Teuchos::VERB_DEFAULT;
3297  using Teuchos::VERB_NONE;
3298  using Teuchos::VERB_LOW;
3299  using Teuchos::VERB_MEDIUM;
3300  // using Teuchos::VERB_HIGH;
3301  using Teuchos::VERB_EXTREME;
3302  using Teuchos::RCP;
3303  using Teuchos::wait;
3304  using std::endl;
3305 #ifdef HAVE_TPETRA_DEBUG
3306  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::describe: ";
3307 #endif // HAVE_TPETRA_DEBUG
3308 
3309  // Set default verbosity if applicable.
3310  const Teuchos::EVerbosityLevel vl =
3311  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3312 
3313  if (vl == VERB_NONE) {
3314  return; // print nothing
3315  }
3316 
3317  // describe() always starts with a tab before it prints anything.
3318  Teuchos::OSTab tab0 (out);
3319 
3320  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3321  Teuchos::OSTab tab1 (out);
3322 
3323  out << "Template parameters:" << endl;
3324  {
3325  Teuchos::OSTab tab2 (out);
3326  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3327  << "LO: " << TypeNameTraits<LO>::name () << endl
3328  << "GO: " << TypeNameTraits<GO>::name () << endl
3329  << "Node: " << TypeNameTraits<Node>::name () << endl;
3330  }
3331  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3332  << "Global dimensions: ["
3333  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3334  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3335 
3336  const LO blockSize = getBlockSize ();
3337  out << "Block size: " << blockSize << endl;
3338 
3339  // constituent objects
3340  if (vl >= VERB_MEDIUM) {
3341  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3342  const int myRank = comm.getRank ();
3343  if (myRank == 0) {
3344  out << "Row Map:" << endl;
3345  }
3346  getRowMap()->describe (out, vl);
3347 
3348  if (! getColMap ().is_null ()) {
3349  if (getColMap() == getRowMap()) {
3350  if (myRank == 0) {
3351  out << "Column Map: same as row Map" << endl;
3352  }
3353  }
3354  else {
3355  if (myRank == 0) {
3356  out << "Column Map:" << endl;
3357  }
3358  getColMap ()->describe (out, vl);
3359  }
3360  }
3361  if (! getDomainMap ().is_null ()) {
3362  if (getDomainMap () == getRowMap ()) {
3363  if (myRank == 0) {
3364  out << "Domain Map: same as row Map" << endl;
3365  }
3366  }
3367  else if (getDomainMap () == getColMap ()) {
3368  if (myRank == 0) {
3369  out << "Domain Map: same as column Map" << endl;
3370  }
3371  }
3372  else {
3373  if (myRank == 0) {
3374  out << "Domain Map:" << endl;
3375  }
3376  getDomainMap ()->describe (out, vl);
3377  }
3378  }
3379  if (! getRangeMap ().is_null ()) {
3380  if (getRangeMap () == getDomainMap ()) {
3381  if (myRank == 0) {
3382  out << "Range Map: same as domain Map" << endl;
3383  }
3384  }
3385  else if (getRangeMap () == getRowMap ()) {
3386  if (myRank == 0) {
3387  out << "Range Map: same as row Map" << endl;
3388  }
3389  }
3390  else {
3391  if (myRank == 0) {
3392  out << "Range Map: " << endl;
3393  }
3394  getRangeMap ()->describe (out, vl);
3395  }
3396  }
3397  }
3398 
3399  if (vl >= VERB_EXTREME) {
3400  // FIXME (mfh 26 May 2016) It's not nice for this method to sync
3401  // to host, since it's supposed to be const. However, that's
3402  // the easiest and least memory-intensive way to implement this
3403  // method.
3405  const_cast<this_type*> (this)->template sync<Kokkos::HostSpace> ();
3406 
3407 #ifdef HAVE_TPETRA_DEBUG
3408  TEUCHOS_TEST_FOR_EXCEPTION
3409  (this->template need_sync<Kokkos::HostSpace> (), std::logic_error,
3410  prefix << "Right after sync to host, the matrix claims that it needs "
3411  "sync to host. Please report this bug to the Tpetra developers.");
3412 #endif // HAVE_TPETRA_DEBUG
3413 
3414  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3415  const int myRank = comm.getRank ();
3416  const int numProcs = comm.getSize ();
3417 
3418  // Print the calling process' data to the given output stream.
3419  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3420  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3421  FancyOStream& os = *osPtr;
3422  os << "Process " << myRank << ":" << endl;
3423  Teuchos::OSTab tab2 (os);
3424 
3425  const map_type& meshRowMap = * (graph_.getRowMap ());
3426  const map_type& meshColMap = * (graph_.getColMap ());
3427  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3428  meshLclRow <= meshRowMap.getMaxLocalIndex ();
3429  ++meshLclRow) {
3430  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3431  os << "Row " << meshGblRow << ": {";
3432 
3433  const LO* lclColInds = NULL;
3434  Scalar* vals = NULL;
3435  LO numInds = 0;
3436  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
3437 
3438  for (LO k = 0; k < numInds; ++k) {
3439  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3440 
3441  os << "Col " << gblCol << ": [";
// Each block is printed MATLAB-style: entries within a row separated
// by ", ", rows within a block separated by "; ".
3442  for (LO i = 0; i < blockSize; ++i) {
3443  for (LO j = 0; j < blockSize; ++j) {
3444  os << vals[blockSize*blockSize*k + i*blockSize + j];
3445  if (j + 1 < blockSize) {
3446  os << ", ";
3447  }
3448  }
3449  if (i + 1 < blockSize) {
3450  os << "; ";
3451  }
3452  }
3453  os << "]";
3454  if (k + 1 < numInds) {
3455  os << ", ";
3456  }
3457  }
3458  os << "}" << endl;
3459  }
3460 
3461  // Print data on Process 0. This will automatically respect the
3462  // current indentation level.
3463  if (myRank == 0) {
3464  out << lclOutStrPtr->str ();
3465  lclOutStrPtr = Teuchos::null; // clear it to save space
3466  }
3467 
3468  const int sizeTag = 1337;
3469  const int dataTag = 1338;
3470 
3471  ArrayRCP<char> recvDataBuf; // only used on Process 0
3472 
3473  // Send string sizes and data from each process in turn to
3474  // Process 0, and print on that process.
3475  for (int p = 1; p < numProcs; ++p) {
3476  if (myRank == 0) {
3477  // Receive the incoming string's length.
3478  ArrayRCP<size_t> recvSize (1);
3479  recvSize[0] = 0;
3480  RCP<CommRequest<int> > recvSizeReq =
3481  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3482  wait<int> (comm, outArg (recvSizeReq));
3483  const size_t numCharsToRecv = recvSize[0];
3484 
3485  // Allocate space for the string to receive. Reuse receive
3486  // buffer space if possible. We can do this because in the
3487  // current implementation, we only have one receive in
3488  // flight at a time. Leave space for the '\0' at the end,
3489  // in case the sender doesn't send it.
3490  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3491  recvDataBuf.resize (numCharsToRecv + 1);
3492  }
3493  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3494  // Post the receive of the actual string data.
3495  RCP<CommRequest<int> > recvDataReq =
3496  ireceive<int, char> (recvData, p, dataTag, comm);
3497  wait<int> (comm, outArg (recvDataReq));
3498 
3499  // Print the received data. This will respect the current
3500  // indentation level. Make sure that the string is
3501  // null-terminated.
3502  recvDataBuf[numCharsToRecv] = '\0';
3503  out << recvDataBuf.getRawPtr ();
3504  }
3505  else if (myRank == p) { // if I am not Process 0, and my rank is p
3506  // This deep-copies the string at most twice, depending on
3507  // whether std::string reference counts internally (it
3508  // generally does, so this won't deep-copy at all).
3509  const std::string stringToSend = lclOutStrPtr->str ();
3510  lclOutStrPtr = Teuchos::null; // clear original to save space
3511 
3512  // Send the string's length to Process 0.
3513  const size_t numCharsToSend = stringToSend.size ();
3514  ArrayRCP<size_t> sendSize (1);
3515  sendSize[0] = numCharsToSend;
3516  RCP<CommRequest<int> > sendSizeReq =
3517  isend<int, size_t> (sendSize, 0, sizeTag, comm);
3518  wait<int> (comm, outArg (sendSizeReq));
3519 
3520  // Send the actual string to Process 0. We know that the
3521  // string has length > 0, so it's safe to take the address
3522  // of the first entry. Make a nonowning ArrayRCP to hold
3523  // the string. Process 0 will add a null termination
3524  // character at the end of the string, after it receives the
3525  // message.
3526  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3527  RCP<CommRequest<int> > sendDataReq =
3528  isend<int, char> (sendData, 0, dataTag, comm);
3529  wait<int> (comm, outArg (sendDataReq));
3530  }
3531  } // for each process rank p other than 0
3532  } // extreme verbosity level (print the whole matrix)
3533  }
3534 
// The communicator over which this matrix is distributed: the matrix
// simply shares its graph's communicator.
3535  template<class Scalar, class LO, class GO, class Node>
3536  Teuchos::RCP<const Teuchos::Comm<int> >
3538  getComm() const
3539  {
3540  return graph_.getComm();
3541  }
3542 
// The Kokkos Node instance; delegated to the graph.
3543  template<class Scalar, class LO, class GO, class Node>
3544  Teuchos::RCP<Node>
3546  getNode() const
3547  {
3548  return graph_.getNode();
3549 
3550  }
3551 
// Global number of (mesh) columns; delegated to the graph.
// (The method-name line was hyperlinked in the extraction and is not
// visible here.)
3552  template<class Scalar, class LO, class GO, class Node>
3556  {
3557  return graph_.getGlobalNumCols();
3558  }
3559 
// Local (on-process) number of (mesh) columns; delegated to the graph.
3560  template<class Scalar, class LO, class GO, class Node>
3561  size_t
3564  {
3565  return graph_.getNodeNumCols();
3566  }
3567 
// Index base for global indices; delegated to the graph.
3568  template<class Scalar, class LO, class GO, class Node>
3569  GO
3572  {
3573  return graph_.getIndexBase();
3574  }
3575 
// Global number of stored (structurally nonzero) block entries;
// delegated to the graph.
3576  template<class Scalar, class LO, class GO, class Node>
3580  {
3581  return graph_.getGlobalNumEntries();
3582  }
3583 
// Local number of stored (structurally nonzero) block entries;
// delegated to the graph.
3584  template<class Scalar, class LO, class GO, class Node>
3585  size_t
3588  {
3589  return graph_.getNodeNumEntries();
3590  }
3591 
// Number of entries this process stores in the given global row;
// delegated to the graph.
3592  template<class Scalar, class LO, class GO, class Node>
3593  size_t
3595  getNumEntriesInGlobalRow (GO globalRow) const
3596  {
3597  return graph_.getNumEntriesInGlobalRow(globalRow);
3598  }
3599 
// Deprecated query (global number of diagonal entries in the graph).
// HDM is presumably an alias for the HasDeprecatedMethods2630 mix-in
// base of the graph — the typedef line is not visible in this chunk;
// TODO(review): confirm against the full source.
3600  template<class Scalar, class LO, class GO, class Node>
3604  {
3606  return dynamic_cast<const HDM&> (this->graph_).getGlobalNumDiagsImpl ();
3607  }
3608 
// Deprecated query (local number of diagonal entries in the graph);
// same HDM mix-in cast as getGlobalNumDiags above — typedef line not
// visible here; TODO(review): confirm against the full source.
3609  template<class Scalar, class LO, class GO, class Node>
3610  size_t
3613  {
3615  return dynamic_cast<const HDM&> (this->graph_).getNodeNumDiagsImpl ();
3616  }
3617 
// Maximum number of entries in any row, over all processes; delegated
// to the graph.
3618  template<class Scalar, class LO, class GO, class Node>
3619  size_t
3622  {
3623  return graph_.getGlobalMaxNumRowEntries();
3624  }
3625 
// Whether the matrix has a well-defined column Map; delegated to the
// graph.
3626  template<class Scalar, class LO, class GO, class Node>
3627  bool
3629  hasColMap() const
3630  {
3631  return graph_.hasColMap();
3632  }
3633 
// Deprecated query: whether the graph is locally lower triangular.
// Forwards through the deprecated-methods mix-in (HDM); the typedef
// line is not visible here — TODO(review): confirm.
3634  template<class Scalar, class LO, class GO, class Node>
3635  bool
3638  {
3640  return dynamic_cast<const HDM&> (this->graph_).isLowerTriangularImpl ();
3641  }
3642 
// Deprecated query: whether the graph is locally upper triangular.
// Forwards through the deprecated-methods mix-in (HDM); the typedef
// line is not visible here — TODO(review): confirm.
3643  template<class Scalar, class LO, class GO, class Node>
3644  bool
3647  {
3649  return dynamic_cast<const HDM&> (this->graph_).isUpperTriangularImpl ();
3650  }
3651 
// Whether matrix indices are locally indexed; delegated to the graph.
3652  template<class Scalar, class LO, class GO, class Node>
3653  bool
3656  {
3657  return graph_.isLocallyIndexed();
3658  }
3659 
// Whether matrix indices are globally indexed; delegated to the graph.
3660  template<class Scalar, class LO, class GO, class Node>
3661  bool
3664  {
3665  return graph_.isGloballyIndexed();
3666  }
3667 
// Whether fillComplete() has been called; delegated to the graph.
3668  template<class Scalar, class LO, class GO, class Node>
3669  bool
3672  {
3673  return graph_.isFillComplete ();
3674  }
3675 
// This class does not implement the RowMatrix row-view interface (the
// ArrayView overloads below always throw), so report false.
3676  template<class Scalar, class LO, class GO, class Node>
3677  bool
3680  {
3681  return false;
3682  }
3683 
3684 
// Not implemented: BlockCrsMatrix does not support global matrix
// indexing, so this RowMatrix-interface method always throws
// std::logic_error.
3685  template<class Scalar, class LO, class GO, class Node>
3686  void
3688  getGlobalRowCopy (GO GlobalRow,
3689  const Teuchos::ArrayView<GO> &Indices,
3690  const Teuchos::ArrayView<Scalar> &Values,
3691  size_t &NumEntries) const
3692  {
3693  TEUCHOS_TEST_FOR_EXCEPTION(
3694  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowCopy: "
3695  "This class doesn't support global matrix indexing.");
3696 
3697  }
3698 
// Not implemented: BlockCrsMatrix does not support global matrix
// indexing, so this RowMatrix-interface method always throws
// std::logic_error.
3699  template<class Scalar, class LO, class GO, class Node>
3700  void
3702  getGlobalRowView (GO GlobalRow,
3703  Teuchos::ArrayView<const GO> &indices,
3704  Teuchos::ArrayView<const Scalar> &values) const
3705  {
3706  TEUCHOS_TEST_FOR_EXCEPTION(
3707  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
3708  "This class doesn't support global matrix indexing.");
3709 
3710  }
3711 
// Not implemented: this RowMatrix-interface (ArrayView) overload always
// throws std::logic_error. The pointer-based getLocalRowView overload
// (taking const LO*&, Scalar*&, LO&) is the supported way to view a
// block row.
3712  template<class Scalar, class LO, class GO, class Node>
3713  void
3715  getLocalRowView (LO LocalRow,
3716  Teuchos::ArrayView<const LO>& indices,
3717  Teuchos::ArrayView<const Scalar>& values) const
3718  {
3719  TEUCHOS_TEST_FOR_EXCEPTION(
3720  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: "
3721  "This class doesn't support local matrix indexing.");
3722  }
3723 
// Copy the (point) diagonal entries of the matrix into the given
// vector `diag`. Works on host: fetches the per-row diagonal-block
// offsets from the graph, walks each block row, and extracts the
// diagonal of each diagonal block.
// (The signature line was hyperlinked in the extraction and is not
// visible here.)
3724  template<class Scalar, class LO, class GO, class Node>
3725  void
3728  {
3729 #ifdef HAVE_TPETRA_DEBUG
3730  const char prefix[] =
3731  "Tpetra::Experimental::BlockCrsMatrix::getLocalDiagCopy: ";
3732 #endif // HAVE_TPETRA_DEBUG
3733 
3734  const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3735 
3736  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3737  graph_.getLocalDiagOffsets (diagOffsets);
3738 
3739  // The code below works on host, so use a host View.
3740  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3741  Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3742  // We're filling diag on host for now.
3743  diag.template modify<typename decltype (diagOffsetsHost)::memory_space> ();
3744 
3745 #ifdef HAVE_TPETRA_DEBUG
3746  TEUCHOS_TEST_FOR_EXCEPTION
3747  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
3748  prefix << "The matrix's data were last modified on device, but have "
3749  "not been sync'd to host. Please sync to host (by calling "
3750  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
3751  "method.");
3752 #endif // HAVE_TPETRA_DEBUG
3753 
3754  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
3755  // version of the data always exists (no lazy allocation for host
3756  // data).
3758  auto vals_host_out =
3759  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
3760  Scalar* vals_host_out_raw =
3761  reinterpret_cast<Scalar*> (vals_host_out.data ());
3762 
3763  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3764  size_t rowOffset = 0;
3765  size_t offset = 0;
3766  LO bs = getBlockSize();
3767  for(size_t r=0; r<getNodeNumRows(); r++)
3768  {
3769  // move pointer to start of diagonal block
3770  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3771  for(int b=0; b<bs; b++)
3772  {
// Stride bs+1 walks the diagonal of a bs-by-bs block stored
// contiguously in row-major order.
3773  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3774  }
3775  // move pointer to start of next block row
3776  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3777  }
3778 
3779  diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3780  }
3781 
// Not implemented for BlockCrsMatrix: always throws std::logic_error.
3782  template<class Scalar, class LO, class GO, class Node>
3783  void
3785  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& x)
3786  {
3787  TEUCHOS_TEST_FOR_EXCEPTION(
3788  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::leftScale: "
3789  "not implemented.");
3790 
3791  }
3792 
// Not implemented for BlockCrsMatrix: always throws std::logic_error.
3793  template<class Scalar, class LO, class GO, class Node>
3794  void
3796  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& x)
3797  {
3798  TEUCHOS_TEST_FOR_EXCEPTION(
3799  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::rightScale: "
3800  "not implemented.");
3801 
3802  }
3803 
// Return the (mesh) graph as a RowGraph, via the RCP member kept for
// exactly this purpose.
3804  template<class Scalar, class LO, class GO, class Node>
3805  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3807  getGraph() const
3808  {
3809  return graphRCP_;
3810  }
3811 
// Not implemented: always throws std::logic_error, so the missing
// return statement is never reached.
3812  template<class Scalar, class LO, class GO, class Node>
3813  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3816  {
3817  TEUCHOS_TEST_FOR_EXCEPTION(
3818  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getFrobeniusNorm: "
3819  "not implemented.");
3820  }
3821 
3822 } // namespace Classes
3823 } // namespace Experimental
3824 } // namespace Tpetra
3825 
3826 //
3827 // Explicit instantiation macro
3828 //
3829 // Must be expanded from within the Tpetra namespace!
3830 //
// Instantiates BlockCrsMatrix for one (Scalar, LO, GO, Node)
// combination, inside namespace Tpetra::Experimental::Classes.
3831 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3832  namespace Experimental { \
3833  namespace Classes { \
3834  template class BlockCrsMatrix< S, LO, GO, NODE >; \
3835  } \
3836  }
3837 
3838 #endif // TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
Tpetra::Experimental::Classes::BlockMultiVector::scale
void scale(const Scalar &val)
Multiply all entries in place by the given value val.
Definition: Tpetra_Experimental_BlockMultiVector_def.hpp:648
Tpetra::Classes::CrsGraph::getRangeMap
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
Definition: Tpetra_CrsGraph_def.hpp:931
Tpetra::Experimental::Classes::BlockCrsMatrix::describe
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const
Print a description of this object to the given output stream.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3285
Tpetra::Experimental::Classes::BlockCrsMatrix::getNodeNumCols
virtual size_t getNodeNumCols() const
The number of columns needed to apply the forward operator on this node.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3563
Tpetra::ESweepDirection
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
Definition: Tpetra_ConfigDefs.hpp:245
Tpetra::Experimental::Classes::BlockCrsMatrix::replaceLocalValues
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh,...
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:981
Tpetra::Experimental::Classes::BlockCrsMatrix::sumIntoLocalValuesByOffsets
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1676
Tpetra::Experimental::Classes::BlockCrsMatrix::isFillComplete
virtual bool isFillComplete() const
Whether fillComplete() has been called.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3671
Tpetra::Experimental::Classes::BlockCrsMatrix::getGlobalNumEntries
virtual global_size_t getGlobalNumEntries() const
The global number of stored (structurally nonzero) entries.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3579
Tpetra::Experimental::Classes::BlockMultiVector::getMultiVectorView
mv_type getMultiVectorView() const
Get a Tpetra::MultiVector that views this BlockMultiVector's data.
Definition: Tpetra_Experimental_BlockMultiVector_def.hpp:104
Tpetra::Experimental::Classes::BlockCrsMatrix::absMaxLocalValues
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but for the ABSMAX combine mode.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1371
Tpetra::Experimental::Classes::BlockMultiVector::putScalar
void putScalar(const Scalar &val)
Fill all entries with the given value val.
Definition: Tpetra_Experimental_BlockMultiVector_def.hpp:641
Tpetra::Experimental::FILL
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
Definition: Tpetra_Experimental_BlockView.hpp:684
Tpetra::Experimental::Classes::BlockMultiVector::getNumVectors
LO getNumVectors() const
Get the number of columns (vectors) in the BlockMultiVector.
Definition: Tpetra_Experimental_BlockMultiVector_decl.hpp:329
Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers
Mix-in to avoid spurious deprecation warnings due to #2630.
Definition: Tpetra_CrsGraph_decl.hpp:180
Tpetra::Experimental::Classes::BlockCrsMatrix::const_little_block_type
Kokkos::View< const impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
Definition: Tpetra_Experimental_BlockCrsMatrix_decl.hpp:201
Tpetra::Classes::Map::getGlobalElement
GlobalOrdinal getGlobalElement(LocalOrdinal localIndex) const
The global index corresponding to the given local index.
Definition: Tpetra_Map_def.hpp:1114
Tpetra::Experimental::Classes::BlockCrsMatrix::getGlobalMaxNumRowEntries
virtual size_t getGlobalMaxNumRowEntries() const
The maximum number of entries across all rows/columns on all nodes.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3621
Tpetra::Classes::CrsGraph::getDomainMap
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Definition: Tpetra_CrsGraph_def.hpp:922
Tpetra::Experimental::Classes::BlockCrsMatrix::description
std::string description() const
One-line description of this object.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3261
Tpetra::Experimental::AXPY
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
Definition: Tpetra_Experimental_BlockView.hpp:702
Tpetra::Experimental::Classes::BlockCrsMatrix::getGraph
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const
Get the (mesh) graph.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3807
Tpetra::REPLACE
Replace existing values with new values.
Definition: Tpetra_CombineMode.hpp:97
Tpetra::Experimental::Classes::BlockCrsMatrix::getRowMap
Teuchos::RCP< const map_type > getRowMap() const
get the (mesh) map for the rows of this block matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:809
Tpetra::Experimental::Classes::BlockCrsMatrix::getLocalRowView
LO getLocalRowView(const LO localRowInd, const LO *&colInds, Scalar *&vals, LO &numInds) const
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1491
Tpetra::Experimental::Classes::BlockCrsMatrix::replaceLocalValuesByOffsets
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1598
Tpetra::Experimental::Classes::BlockCrsMatrix::apply
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:849
Tpetra::Experimental::Classes::BlockCrsMatrix::getGlobalRowView
virtual void getGlobalRowView(GO GlobalRow, Teuchos::ArrayView< const GO > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3702
Tpetra::Experimental::Classes::BlockCrsMatrix::isUpperTriangular
virtual bool TPETRA_DEPRECATED isUpperTriangular() const
Whether the matrix's graph is locally upper triangular.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3646
Tpetra::Classes::Vector::replaceLocalValue
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value) const
Replace current value at the specified location with specified values.
Definition: Tpetra_Vector_def.hpp:142
Tpetra::Experimental::Classes::BlockCrsMatrix::little_vec_type
Kokkos::View< impl_scalar_type *, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_vec_type
The type used to access nonconst vector blocks.
Definition: Tpetra_Experimental_BlockCrsMatrix_decl.hpp:207
Tpetra::Experimental::Classes::BlockCrsMatrix::leftScale
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the left with the given Vector x.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3785
Tpetra::Experimental::Classes::BlockCrsMatrix::getLocalRowCopy
void getLocalRowCopy(LO LocalRow, const Teuchos::ArrayView< LO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Not implemented.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1539
Tpetra::Experimental::Classes::BlockCrsMatrix::getIndexBase
virtual GO getIndexBase() const
The index base for global indices in this matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3571
Tpetra_Details_PackTraits.hpp
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpack data.
Tpetra::Experimental::Classes::BlockCrsMatrix::getNodeMaxNumRowEntries
size_t getNodeMaxNumRowEntries() const
Maximum number of entries in any row of the matrix, on this process.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:841
Tpetra::Classes::CrsGraph::getLocalGraph
local_graph_type getLocalGraph() const
Get the local graph.
Definition: Tpetra_CrsGraph_def.hpp:4766
Tpetra::Experimental::Classes::BlockCrsMatrix
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
Definition: Tpetra_Experimental_BlockCrsMatrix_decl.hpp:138
Tpetra::Experimental::Classes::BlockCrsMatrix::localGaussSeidel
void localGaussSeidel(const BlockMultiVector< Scalar, LO, GO, Node > &Residual, BlockMultiVector< Scalar, LO, GO, Node > &Solution, const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &D_inv, const Scalar &omega, const ESweepDirection direction) const
Local Gauss-Seidel solve, given a factorized diagonal.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1105
Tpetra::Experimental::Classes::BlockCrsMatrix::setAllToScalar
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:942
Tpetra::ZERO
Replace old values with zero.
Definition: Tpetra_CombineMode.hpp:99
Tpetra::Experimental::Classes::BlockCrsMatrix::getComm
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which this matrix is distributed.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3538
Tpetra::Classes::CrsGraph::getNodeNumEntries
size_t getNodeNumEntries() const override
The local number of entries in the graph.
Definition: Tpetra_CrsGraph_def.hpp:1014
Tpetra::Classes::DistObject::doExport
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, CombineMode CM)
Export data into this object using an Export object ("forward mode").
Definition: Tpetra_DistObject_def.hpp:294
Tpetra::Experimental::Classes::BlockCrsMatrix::getColMap
Teuchos::RCP< const map_type > getColMap() const
get the (mesh) map for the columns of this block matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:817
Tpetra::Experimental::Classes::BlockMultiVector::getLocalBlock
little_vec_type::HostMirror getLocalBlock(const LO localRowIndex, const LO colIndex) const
Get a host view of the degrees of freedom at the given mesh point.
Definition: Tpetra_Experimental_BlockMultiVector_def.hpp:425
Tpetra::ADD
Sum new values into existing values.
Definition: Tpetra_CombineMode.hpp:95
Tpetra::Experimental::Classes::BlockMultiVector::getBlockSize
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
Definition: Tpetra_Experimental_BlockMultiVector_decl.hpp:324
Tpetra::Experimental::Classes::BlockCrsMatrix::BlockCrsMatrix
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:658
Tpetra::Classes::CrsGraph::isSorted
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Definition: Tpetra_CrsGraph_def.hpp:1246
Tpetra::Experimental::Classes::BlockCrsMatrix::little_block_type
Kokkos::View< impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
Definition: Tpetra_Experimental_BlockCrsMatrix_decl.hpp:195
Tpetra::ABSMAX
Replace old value with maximum of magnitudes of old and new values.
Definition: Tpetra_CombineMode.hpp:98
Tpetra::Experimental::Classes::BlockCrsMatrix::impl_scalar_type
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_decl.hpp:164
Tpetra::Experimental::Classes::BlockCrsMatrix::getNodeNumDiags
virtual size_t TPETRA_DEPRECATED getNodeNumDiags() const
Number of diagonal entries in the matrix's graph, on the calling process.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3612
Tpetra::Classes::DistObject
Base class for distributed Tpetra objects that support data redistribution.
Definition: Tpetra_DistObject_decl.hpp:349
Tpetra::Experimental::Classes::BlockCrsMatrix::rightScale
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the right with the given Vector x.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3796
Tpetra::Experimental::Classes::BlockCrsMatrix::getNodeNumRows
size_t getNodeNumRows() const
get the local number of block rows
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:833
Tpetra::Experimental::Classes::BlockCrsMatrix::hasColMap
virtual bool hasColMap() const
Whether this matrix has a well-defined column map.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3629
Tpetra::Distributor
Sets up and executes a communication plan for a Tpetra DistObject.
Definition: Tpetra_Distributor.hpp:188
Tpetra::Experimental::Classes::BlockMultiVector::makePointMap
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
Definition: Tpetra_Experimental_BlockMultiVector_def.hpp:262
Tpetra::Experimental::Classes::BlockCrsMatrix::getFrobeniusNorm
virtual typename ::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const
The Frobenius norm of the matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3815
Tpetra::Classes::Map::getMaxLocalIndex
LocalOrdinal getMaxLocalIndex() const
The maximum local index on the calling process.
Definition: Tpetra_Map_decl.hpp:610
Tpetra::Experimental::COPY
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dimensions.
Definition: Tpetra_Experimental_BlockView.hpp:728
Tpetra::Classes::MultiVector
One or more distributed dense vectors.
Definition: Tpetra_MultiVector_decl.hpp:389
Tpetra::Details::DefaultTypes::local_ordinal_type
int local_ordinal_type
Default value of the LocalOrdinal template parameter.
Definition: Tpetra_Details_DefaultTypes.hpp:72
Tpetra::Experimental::Classes::BlockCrsMatrix::supportsRowViews
virtual bool supportsRowViews() const
Whether this object implements getLocalRowView() and getGlobalRowView().
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3679
Tpetra::Experimental::Classes::BlockCrsMatrix::getGlobalNumDiags
virtual global_size_t TPETRA_DEPRECATED getGlobalNumDiags() const
Number of diagonal entries in the matrix's graph, over all processes in the matrix's communicator.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3603
Tpetra::Experimental::Classes::BlockCrsMatrix::gaussSeidelCopy
void gaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1216
Tpetra::Experimental::Classes::BlockCrsMatrix::getNodeNumEntries
virtual size_t getNodeNumEntries() const
The local number of stored (structurally nonzero) entries.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3587
Tpetra::Experimental::Classes::BlockCrsMatrix::getGlobalRowCopy
virtual void getGlobalRowCopy(GO GlobalRow, const Teuchos::ArrayView< GO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Get a copy of the given global row's entries.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3688
Tpetra::Experimental::Classes::BlockCrsMatrix::getNumEntriesInGlobalRow
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const
The current number of entries on the calling process in the specified global row.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3595
Tpetra::Experimental::Classes::BlockCrsMatrix::getDomainMap
Teuchos::RCP< const map_type > getDomainMap() const
Get the (point) domain Map of this matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:791
Tpetra::Experimental::Classes::BlockCrsMatrix::isLocallyIndexed
virtual bool isLocallyIndexed() const
Whether matrix indices are locally indexed.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3655
Tpetra::Experimental::Classes::BlockCrsMatrix::isLowerTriangular
virtual bool TPETRA_DEPRECATED isLowerTriangular() const
Whether the matrix's graph is locally lower triangular.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3637
Tpetra::Experimental::Classes::BlockCrsMatrix::device_type
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
Definition: Tpetra_Experimental_BlockCrsMatrix_decl.hpp:177
Tpetra::Experimental::Classes::BlockMultiVector
MultiVector for multiple degrees of freedom per mesh point.
Definition: Tpetra_Experimental_BlockMultiVector_decl.hpp:147
Tpetra::Classes::Map
A parallel distribution of indices over processes.
Definition: Tpetra_Map_decl.hpp:247
Tpetra::Classes::DistObject::execution_space
device_type::execution_space execution_space
The Kokkos execution space.
Definition: Tpetra_DistObject_decl.hpp:372
Tpetra::Experimental::Classes::BlockCrsMatrix::absMaxLocalValuesByOffsets
LO absMaxLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1637
Tpetra_Experimental_BlockCrsMatrix_decl.hpp
Declaration of Tpetra::Experimental::BlockCrsMatrix.
Tpetra::Experimental::SCAL
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
Definition: Tpetra_Experimental_BlockView.hpp:672
Tpetra::Experimental::Classes::BlockCrsMatrix::getLocalDiagOffsets
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1058
Tpetra::Experimental::Classes::BlockCrsMatrix::applyBlock
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:914
Tpetra::Classes::CrsGraph::local_graph_type
Kokkos::StaticCrsGraph< LocalOrdinal, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
Definition: Tpetra_CrsGraph_decl.hpp:292
Tpetra::Experimental::Classes::BlockCrsMatrix::getLocalDiagCopy
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos::View.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1284
Tpetra::Experimental::Classes::BlockCrsMatrix::reorderedGaussSeidelCopy
void reorderedGaussSeidelCopy(::Tpetra::MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Teuchos::ArrayView< LO > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1234
Tpetra::global_size_t
size_t global_size_t
Global size_t object.
Definition: Tpetra_ConfigDefs.hpp:109
Tpetra::Classes::Map::getMinLocalIndex
LocalOrdinal getMinLocalIndex() const
The minimum local index.
Definition: Tpetra_Map_decl.hpp:596
Tpetra
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Tpetra::deep_copy
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Definition: Tpetra_MultiVector_decl.hpp:2453
Tpetra::Experimental::Classes::BlockCrsMatrix::sumIntoLocalValues
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1414
Tpetra::Experimental::Classes::BlockCrsMatrix::getGlobalNumRows
global_size_t getGlobalNumRows() const
Get the global number of block rows.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:825
Tpetra::Experimental::Classes::BlockCrsMatrix::getRangeMap
Teuchos::RCP< const map_type > getRangeMap() const
Get the (point) range Map of this matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:800
Tpetra::Experimental::GEMV
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
Definition: Tpetra_Experimental_BlockView.hpp:754
Tpetra::Experimental::Classes::BlockCrsMatrix::getGlobalNumCols
virtual global_size_t getGlobalNumCols() const
The global number of columns of this matrix.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3555
Tpetra::Experimental::Classes::BlockCrsMatrix::getLocalRowOffsets
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1565
Tpetra::Experimental::Classes::BlockCrsMatrix::memory_space
device_type::memory_space memory_space
The Kokkos memory space that this class uses.
Definition: Tpetra_Experimental_BlockCrsMatrix_decl.hpp:181
Tpetra::INSERT
Insert new values that don't currently exist.
Definition: Tpetra_CombineMode.hpp:96
Tpetra::Classes::CrsGraph
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Definition: Tpetra_CrsGraph_decl.hpp:259
Tpetra::Experimental::Classes::BlockCrsMatrix::isGloballyIndexed
virtual bool isGloballyIndexed() const
Whether matrix indices are globally indexed.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3663
Tpetra::Experimental::Classes::BlockCrsMatrix::getNumEntriesInLocalRow
size_t getNumEntriesInLocalRow(const LO localRowInd) const
Return the number of entries in the given row on the calling process.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:1717
Tpetra::Experimental::Classes::BlockCrsMatrix::getNode
virtual Teuchos::RCP< Node > getNode() const
The Kokkos Node instance.
Definition: Tpetra_Experimental_BlockCrsMatrix_def.hpp:3546
Tpetra::CombineMode
CombineMode
Rule for combining data in an Import or Export.
Definition: Tpetra_CombineMode.hpp:94
Tpetra::Classes::Vector
A distributed dense vector.
Definition: Tpetra_Vector_decl.hpp:82