Tpetra parallel linear algebra  Version of the Day
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
// @HEADER
// ***********************************************************************
//
// Tpetra: Templated Linear Algebra Services Package
// Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
// @HEADER
41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "Tpetra_ConfigDefs.hpp"
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
47 #include "Tpetra_Util.hpp"
48 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_RowMatrixTransposer.hpp"
52 #include "Tpetra_Details_radixSort.hpp"
53 #include "Tpetra_ConfigDefs.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Export.hpp"
56 #include "Tpetra_Import_Util.hpp"
57 #include "Tpetra_Import_Util2.hpp"
58 #include <algorithm>
59 #include <type_traits>
60 #include "Teuchos_FancyOStream.hpp"
61 
62 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
64 
65 #include "KokkosSparse_spgemm.hpp"
66 #include "KokkosSparse_spadd.hpp"
67 
68 #include <MatrixMarket_Tpetra.hpp>
69 
77 /*********************************************************************************************************/
78 // Include the architecture-specific kernel partial specializations here
79 // NOTE: This needs to be outside all namespaces
80 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
81 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
82 
83 
84 namespace Tpetra {
85 
86 namespace MatrixMatrix{
87 
88 //
89 // This method forms the matrix-matrix product C = op(A) * op(B), where
90 // op(A) == A if transposeA is false,
91 // op(A) == A^T if transposeA is true,
92 // and similarly for op(B).
93 //
94 template <class Scalar,
95  class LocalOrdinal,
96  class GlobalOrdinal,
97  class Node>
98 void Multiply(
100  bool transposeA,
102  bool transposeB,
104  bool call_FillComplete_on_result,
105  const std::string& label,
106  const Teuchos::RCP<Teuchos::ParameterList>& params)
107 {
108  using Teuchos::null;
109  using Teuchos::RCP;
110  typedef Scalar SC;
111  typedef LocalOrdinal LO;
112  typedef GlobalOrdinal GO;
113  typedef Node NO;
114  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
115  typedef Import<LO,GO,NO> import_type;
116  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
117  typedef Map<LO,GO,NO> map_type;
118  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
119 
120 #ifdef HAVE_TPETRA_MMM_TIMINGS
121  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
122  using Teuchos::TimeMonitor;
123  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
124 #endif
125 
126  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
127 
128  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
129 
130  // The input matrices A and B must both be fillComplete.
131  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
132  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
133 
134  // If transposeA is true, then Aprime will be the transpose of A
135  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
136  // will just be a pointer to A.
137  RCP<const crs_matrix_type> Aprime = null;
138  // If transposeB is true, then Bprime will be the transpose of B
139  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
140  // will just be a pointer to B.
141  RCP<const crs_matrix_type> Bprime = null;
142 
143  // Is this a "clean" matrix?
144  //
145  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
146  // locally nor globally indexed, then it was empty. I don't like
147  // this, because the most straightforward implementation presumes
148  // lazy allocation of indices. However, historical precedent
149  // demands that we keep around this predicate as a way to test
150  // whether the matrix is empty.
151  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
152 
153  bool use_optimized_ATB = false;
154  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
155  use_optimized_ATB = true;
156 
157 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
158  use_optimized_ATB = false;
159 #endif
160 
161  if (!use_optimized_ATB && transposeA) {
162  transposer_type transposer(rcpFromRef (A));
163  Aprime = transposer.createTranspose();
164 
165  } else {
166  Aprime = rcpFromRef(A);
167  }
168 
169  if (transposeB) {
170  transposer_type transposer(rcpFromRef (B));
171  Bprime = transposer.createTranspose();
172 
173  } else {
174  Bprime = rcpFromRef(B);
175  }
176 
177  // Check size compatibility
178  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
179  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
180  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
181  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
182  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
183  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
184  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
185  prefix << "ERROR, inner dimensions of op(A) and op(B) "
186  "must match for matrix-matrix product. op(A) is "
187  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
188 
189  // The result matrix C must at least have a row-map that reflects the correct
190  // row-size. Don't check the number of columns because rectangular matrices
191  // which were constructed with only one map can still end up having the
192  // correct capacity and dimensions when filled.
193  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
194  prefix << "ERROR, dimensions of result C must "
195  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
196  << " rows, should have at least " << Aouter << std::endl);
197 
198  // It doesn't matter whether C is already Filled or not. If it is already
199  // Filled, it must have space allocated for the positions that will be
200  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
201  // we'll error out later when trying to store result values.
202 
203  // CGB: However, matrix must be in active-fill
204  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
205 
206  // We're going to need to import remotely-owned sections of A and/or B if
207  // more than one processor is performing this run, depending on the scenario.
208  int numProcs = A.getComm()->getSize();
209 
210  // Declare a couple of structs that will be used to hold views of the data
211  // of A and B, to be used for fast access during the matrix-multiplication.
212  crs_matrix_struct_type Aview;
213  crs_matrix_struct_type Bview;
214 
215  RCP<const map_type> targetMap_A = Aprime->getRowMap();
216  RCP<const map_type> targetMap_B = Bprime->getRowMap();
217 
218 #ifdef HAVE_TPETRA_MMM_TIMINGS
219  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X"))));
220 #endif
221 
222  // Now import any needed remote rows and populate the Aview struct
223  // NOTE: We assert that an import isn't needed --- since we do the transpose
224  // above to handle that.
225  if (!use_optimized_ATB) {
226  RCP<const import_type> dummyImporter;
227  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
228  }
229 
230  // We will also need local access to all rows of B that correspond to the
231  // column-map of op(A).
232  if (numProcs > 1)
233  targetMap_B = Aprime->getColMap();
234 
235  // Import any needed remote rows and populate the Bview struct.
236  if (!use_optimized_ATB)
237  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label, params);
238 
239 #ifdef HAVE_TPETRA_MMM_TIMINGS
240  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
241 #endif
242 
243  // Call the appropriate method to perform the actual multiplication.
244  if (use_optimized_ATB) {
245  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
246 
247  } else if (call_FillComplete_on_result && newFlag) {
248  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
249 
250  } else if (call_FillComplete_on_result) {
251  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
252 
253  } else {
254  // mfh 27 Sep 2016: Is this the "slow" case? This
255  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
256  // thread-parallel inserts, but that may take some effort.
257  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
258 
259  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
260 
261 #ifdef HAVE_TPETRA_MMM_TIMINGS
262  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete"))));
263 #endif
264  if (call_FillComplete_on_result) {
265  // We'll call FillComplete on the C matrix before we exit, and give it a
266  // domain-map and a range-map.
267  // The domain-map will be the domain-map of B, unless
268  // op(B)==transpose(B), in which case the range-map of B will be used.
269  // The range-map will be the range-map of A, unless op(A)==transpose(A),
270  // in which case the domain-map of A will be used.
271  if (!C.isFillComplete())
272  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
273  }
274  }
275 }
276 
277 
278 template <class Scalar,
279  class LocalOrdinal,
280  class GlobalOrdinal,
281  class Node>
282 void Jacobi(Scalar omega,
287  bool call_FillComplete_on_result,
288  const std::string& label,
289  const Teuchos::RCP<Teuchos::ParameterList>& params)
290 {
291  using Teuchos::RCP;
292  typedef Scalar SC;
293  typedef LocalOrdinal LO;
294  typedef GlobalOrdinal GO;
295  typedef Node NO;
296  typedef Import<LO,GO,NO> import_type;
297  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
298  typedef Map<LO,GO,NO> map_type;
299  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
300 
301 #ifdef HAVE_TPETRA_MMM_TIMINGS
302  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
303  using Teuchos::TimeMonitor;
304  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup"))));
305 #endif
306 
307  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
308 
309  // A and B should already be Filled.
310  // Should we go ahead and call FillComplete() on them if necessary or error
311  // out? For now, we choose to error out.
312  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
313  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
314 
315  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
316  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
317 
318  // Now check size compatibility
319  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
320  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
321  global_size_t Aouter = A.getGlobalNumRows();
322  global_size_t Bouter = numBCols;
323  global_size_t Ainner = numACols;
324  global_size_t Binner = B.getGlobalNumRows();
325  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
326  prefix << "ERROR, inner dimensions of op(A) and op(B) "
327  "must match for matrix-matrix product. op(A) is "
328  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
329 
330  // The result matrix C must at least have a row-map that reflects the correct
331  // row-size. Don't check the number of columns because rectangular matrices
332  // which were constructed with only one map can still end up having the
333  // correct capacity and dimensions when filled.
334  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
335  prefix << "ERROR, dimensions of result C must "
336  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
337  << " rows, should have at least "<< Aouter << std::endl);
338 
339  // It doesn't matter whether C is already Filled or not. If it is already
340  // Filled, it must have space allocated for the positions that will be
341  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
342  // we'll error out later when trying to store result values.
343 
344  // CGB: However, matrix must be in active-fill
345  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
346 
347  // We're going to need to import remotely-owned sections of A and/or B if
348  // more than one processor is performing this run, depending on the scenario.
349  int numProcs = A.getComm()->getSize();
350 
351  // Declare a couple of structs that will be used to hold views of the data of
352  // A and B, to be used for fast access during the matrix-multiplication.
353  crs_matrix_struct_type Aview;
354  crs_matrix_struct_type Bview;
355 
356  RCP<const map_type> targetMap_A = Aprime->getRowMap();
357  RCP<const map_type> targetMap_B = Bprime->getRowMap();
358 
359 #ifdef HAVE_TPETRA_MMM_TIMINGS
360  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X"))));
361 #endif
362 
363  // Enable globalConstants by default
364  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
365  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
366  if(!params.is_null()) importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
367 
368  //Now import any needed remote rows and populate the Aview struct.
369  RCP<const import_type> dummyImporter;
370  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, false, label,importParams1);
371 
372  // We will also need local access to all rows of B that correspond to the
373  // column-map of op(A).
374  if (numProcs > 1)
375  targetMap_B = Aprime->getColMap();
376 
377  // Now import any needed remote rows and populate the Bview struct.
378  // Enable globalConstants by default
379  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
380  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
381  if(!params.is_null()) importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
382  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label,importParams2);
383 
384 #ifdef HAVE_TPETRA_MMM_TIMINGS
385  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply"))));
386 #endif
387 
388  // Now call the appropriate method to perform the actual multiplication.
389  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
390 
391  // Is this a "clean" matrix
392  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
393 
394  if (call_FillComplete_on_result && newFlag) {
395  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
396 
397  } else if (call_FillComplete_on_result) {
398  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
399 
400  } else {
401  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
402  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
403  // commenting it out.
404 // #ifdef HAVE_TPETRA_MMM_TIMINGS
405 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
406 // #endif
407  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
408  // commenting it out.
409  // if (call_FillComplete_on_result) {
410  // //We'll call FillComplete on the C matrix before we exit, and give
411  // //it a domain-map and a range-map.
412  // //The domain-map will be the domain-map of B, unless
413  // //op(B)==transpose(B), in which case the range-map of B will be used.
414  // //The range-map will be the range-map of A, unless
415  // //op(A)==transpose(A), in which case the domain-map of A will be used.
416  // if (!C.isFillComplete()) {
417  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
418  // }
419  // }
420  }
421 }
422 
423 
424 template <class Scalar,
425  class LocalOrdinal,
426  class GlobalOrdinal,
427  class Node>
428 void Add(
430  bool transposeA,
431  Scalar scalarA,
433  Scalar scalarB )
434 {
435  using Teuchos::Array;
436  using Teuchos::RCP;
437  using Teuchos::null;
438  typedef Scalar SC;
439  typedef LocalOrdinal LO;
440  typedef GlobalOrdinal GO;
441  typedef Node NO;
442  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
443  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
444 
445  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
446 
447  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
448  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
449  "(Result matrix B is not required to be isFillComplete()).");
450  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
451  prefix << "ERROR, input matrix B must not be fill complete!");
452  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
453  prefix << "ERROR, input matrix B must not have static graph!");
454  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
455  prefix << "ERROR, input matrix B must not be locally indexed!");
456  TEUCHOS_TEST_FOR_EXCEPTION(B.getProfileType()!=DynamicProfile, std::runtime_error,
457  prefix << "ERROR, input matrix B must have a dynamic profile!");
458 
459 
460  RCP<const crs_matrix_type> Aprime = null;
461  if (transposeA) {
462  transposer_type transposer(rcpFromRef (A));
463  Aprime = transposer.createTranspose();
464  } else {
465  Aprime = rcpFromRef(A);
466  }
467 
468  size_t a_numEntries;
469  Array<GO> a_inds(A.getNodeMaxNumRowEntries());
470  Array<SC> a_vals(A.getNodeMaxNumRowEntries());
471  GO row;
472 
473  if (scalarB != Teuchos::ScalarTraits<SC>::one())
474  B.scale(scalarB);
475 
476  bool bFilled = B.isFillComplete();
477  size_t numMyRows = B.getNodeNumRows();
478  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
479  for (LO i = 0; (size_t)i < numMyRows; ++i) {
480  row = B.getRowMap()->getGlobalElement(i);
481  Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
482 
483  if (scalarA != Teuchos::ScalarTraits<SC>::one())
484  for (size_t j = 0; j < a_numEntries; ++j)
485  a_vals[j] *= scalarA;
486 
487  if (bFilled)
488  B.sumIntoGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
489  else
490  B.insertGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
491  }
492  }
493 }
494 
495 namespace ColMapFunctors
496 {
497  template<typename ByteView, typename GView>
498  struct UnionEntries
499  {
500  UnionEntries(ByteView entryUnion_, const GView gids_) : entryUnion(entryUnion_), gids(gids_) {}
501  KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const
502  {
503  entryUnion(gids(i)) = 1;
504  }
505  ByteView entryUnion;
506  const GView gids;
507  };
508 
509  template<typename LView, typename GView>
510  struct ConvertGlobalToLocal
511  {
512  ConvertGlobalToLocal(const LView gtol_, const GView gids_, LView lids_) : gtol(gtol_), gids(gids_), lids(lids_) {}
513  KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const
514  {
515  lids(i) = gtol(gids(i));
516  }
517  const LView gtol;
518  const GView gids;
519  LView lids;
520  };
521 }//end ColMapFunctors
522 
523 //Build the minimal (sorted) column map for the given set of global columns
524 //Then convert gids and store them in lids (gids is not modified)
525 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
526 Teuchos::RCP<Map<LocalOrdinal, GlobalOrdinal, Node> > AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
527 makeColMapAndConvertGids(GlobalOrdinal ncols,
528  const typename AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::global_col_inds_array& gids,
529  typename AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::col_inds_array& lids,
530  const Teuchos::RCP<const Teuchos::Comm<int>>& comm)
531 {
532  using namespace ColMapFunctors;
533  using Teuchos::RCP;
534  using Teuchos::rcp;
535  typedef Kokkos::View<char*, device_type> ByteView;
536  typedef global_col_inds_array GView;
537  typedef col_inds_array LView;
538  //Functors (explained in the procedural code below)
539  auto nentries = gids.extent(0);
540  //each entry of entryUnion is 0 unless there is a local entry in that column (then it is 1)
541  ByteView entryUnion("entry union", ncols);
542  UnionEntries<ByteView, GView> ue(entryUnion, gids);
543  Kokkos::parallel_for("Tpetra_MatrixMatrix_unionEntries", range_type(0, nentries), ue);
544  //turn entryUnion into prefix sum gtol (where gtol(i) gives the new local col for global col i)
545  LView gtol("global col -> local col", ncols + 1);
546  ::Tpetra::Details::computeOffsetsFromCounts<decltype(gtol), decltype(entryUnion)>(gtol, entryUnion);
547  //convert gids to local ids and put them in lids (implicitly sorted as long as input gids is sorted per row)
548  ConvertGlobalToLocal<LView, GView> cgtl(gtol, gids, lids);
549  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertGlobalToLocal", range_type(0, gids.extent(0)), cgtl);
550  //build local set of GIDs for constructing column map - the last entry in gtol is the total number of local cols
551  execution_space::fence();
552  GView colmap("column map", gtol(ncols));
553  size_t localIter = 0;
554  execution_space::fence();
555  for(size_t i = 0; i < entryUnion.extent(0); i++)
556  {
557  if(entryUnion(i) != 0)
558  {
559  colmap(localIter++) = i;
560  }
561  }
562  execution_space::fence();
563  //finally, construct Tpetra map
564  return rcp(new map_type(Teuchos::OrdinalTraits<GlobalOrdinal>::invalid(), colmap, 0, comm));
565 }
566 
567 template <class Scalar,
568  class LocalOrdinal,
569  class GlobalOrdinal,
570  class Node>
571 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
572 add (const Scalar& alpha,
573  const bool transposeA,
575  const Scalar& beta,
576  const bool transposeB,
578  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
579  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
580  const Teuchos::RCP<Teuchos::ParameterList>& params)
581 {
582  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
584  Teuchos::RCP<const map_type> CrowMap = transposeB ? B.getDomainMap() : B.getRowMap();
585 
586  Teuchos::RCP<crs_matrix_type> C = rcp(new crs_matrix_type(CrowMap, 0));
587 
588  add(alpha,transposeA,A,beta,transposeB,B,*C,domainMap,rangeMap,params);
589  return C;
590 }
591 
592 template <class Scalar,
593  class LocalOrdinal,
594  class GlobalOrdinal,
595  class Node>
596 void
597 add (const Scalar& alpha,
598  const bool transposeA,
600  const Scalar& beta,
601  const bool transposeB,
604  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
605  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
606  const Teuchos::RCP<Teuchos::ParameterList>& params)
607 {
608  using Teuchos::RCP;
609  using Teuchos::rcp;
610  using Teuchos::rcpFromRef;
611  using Teuchos::rcp_implicit_cast;
612  using Teuchos::rcp_dynamic_cast;
613  using Teuchos::TimeMonitor;
614  typedef Scalar SC;
615  typedef LocalOrdinal LO;
616  typedef GlobalOrdinal GO;
617  typedef Node NO;
618  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
619  typedef Map<LO,GO,NO> map_type;
620  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
621  typedef Import<LO,GO,NO> import_type;
622  typedef Export<LO,GO,NO> export_type;
623  typedef AddDetails::AddKernels<SC,LO,GO,NO> AddKern;
624  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
625  constexpr bool debug = false;
626 
627 #ifdef HAVE_TPETRA_MMM_TIMINGS
628  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
629 #endif
630 
631  if (debug) {
632  std::ostringstream os;
633  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
634  << "TpetraExt::MatrixMatrix::add" << std::endl;
635  std::cerr << os.str ();
636  }
637 
638  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete () || !B.isFillComplete (), std::invalid_argument,
639  prefix_mmm << "A and B must both be fill complete.");
640 #ifdef HAVE_TPETRA_DEBUG
641  // The matrices don't have domain or range Maps unless they are fill complete.
642  if (A.isFillComplete () && B.isFillComplete ()) {
643  const bool domainMapsSame =
644  (!transposeA && !transposeB && !A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
645  (!transposeA && transposeB && !A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
646  ( transposeA && !transposeB && !A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
647  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
648  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
649 
650  const bool rangeMapsSame =
651  (!transposeA && !transposeB && !A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
652  (!transposeA && transposeB && !A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
653  ( transposeA && !transposeB && !A.getDomainMap()->isSameAs (*B.getRangeMap ()));
654  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
655  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
656  }
657 #endif // HAVE_TPETRA_DEBUG
658  auto comm = A.getComm();
659  // Form the explicit transpose of A if necessary.
660  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
661  if (transposeA) {
662  transposer_type transposer(Aprime);
663  Aprime = transposer.createTranspose ();
664  }
665 #ifdef HAVE_TPETRA_DEBUG
666  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
667  prefix_mmm << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
668 #endif // HAVE_TPETRA_DEBUG
669  // Form the explicit transpose of B if necessary.
670  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
671  if (transposeB) {
672  if (debug) {
673  std::ostringstream os;
674  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
675  << "Form explicit xpose of B" << std::endl;
676  std::cerr << os.str ();
677  }
678  transposer_type transposer(Bprime);
679  Bprime = transposer.createTranspose ();
680  }
681 #ifdef HAVE_TPETRA_DEBUG
682  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
683  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
684  TEUCHOS_TEST_FOR_EXCEPTION(
685  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
686  prefix_mmm << "Aprime and Bprime must both be fill complete. "
687  "Please report this bug to the Tpetra developers.");
688 #endif // HAVE_TPETRA_DEBUG
689  RCP<const map_type> CDomainMap = domainMap;
690  RCP<const map_type> CRangeMap = rangeMap;
691  if(CDomainMap.is_null())
692  {
693  CDomainMap = Bprime->getDomainMap();
694  }
695  if(CRangeMap.is_null())
696  {
697  CRangeMap = Bprime->getRangeMap();
698  }
699  assert(!(CDomainMap.is_null()));
700  assert(!(CRangeMap.is_null()));
701  typedef typename AddKern::values_array values_array;
702  typedef typename AddKern::row_ptrs_array row_ptrs_array;
703  typedef typename AddKern::col_inds_array col_inds_array;
704  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
705  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
706  values_array vals;
707  row_ptrs_array rowptrs;
708  col_inds_array colinds;
709  auto acolmap = Aprime->getColMap()->getMyGlobalIndices();
710  auto bcolmap = Bprime->getColMap()->getMyGlobalIndices();
711 #ifdef HAVE_TPETRA_MMM_TIMINGS
712  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
713 #endif
714  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
715  {
716  //import Aprime into Bprime's row map so the local matrices have same # of rows
717  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
718  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
719  }
720  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
721  bool sorted = AGraphSorted && BGraphSorted;
722  RCP<const map_type> CrowMap;
723  RCP<const map_type> CcolMap;
724  RCP<const import_type> Cimport = Teuchos::null;
725  RCP<export_type> Cexport = Teuchos::null;
726  //The unsorted KCRS addition kernel uses std::sort(), which can't run on CUDA
727  if(!matchingColMaps && !(CDomainMap->isContiguous()))
728  {
729  //can't do with current set of kernels, so fall back to original (slow) version
730 #ifdef HAVE_TPETRA_MMM_TIMINGS
731  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("fallback to CrsMatrix::add"))));
732 #endif
733  if (debug) {
734  std::ostringstream os;
735  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
736  << "Call Bprime->add(...)" << std::endl;
737  std::cerr << os.str ();
738  }
739  Teuchos::RCP<crs_matrix_type> C_ = Teuchos::rcp_static_cast<crs_matrix_type>(Bprime->add(alpha, *Aprime, beta, CDomainMap, CRangeMap, params));
740  C.replaceColMap(C_->getColMap());
741  C.setAllValues(C_->getLocalMatrix().graph.row_map,C_->getLocalMatrix().graph.entries,C_->getLocalMatrix().values);
742  C.expertStaticFillComplete(CDomainMap, CRangeMap, C_->getGraph()->getImporter(), C_->getGraph()->getExporter(), params);
743  return;
744  }
745  else if(!matchingColMaps)
746  {
747 #ifdef HAVE_TPETRA_MMM_TIMINGS
748  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
749 #endif
750  //use kernel that converts col indices in both A and B to common domain map before adding
751  auto Acolmap = Aprime->getColMap()->getMyGlobalIndices();
752  auto Bcolmap = Bprime->getColMap()->getMyGlobalIndices();
753  typename AddKern::global_col_inds_array globalColinds("", 0);
754  if (debug) {
755  std::ostringstream os;
756  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
757  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
758  std::cerr << os.str ();
759  }
760  AddKern::convertToGlobalAndAdd(
761  Aprime->getLocalMatrix(), alpha, Bprime->getLocalMatrix(), beta, Acolmap, Bcolmap,
762  CRangeMap->getMinGlobalIndex(), Aprime->getGlobalNumCols(), vals, rowptrs, globalColinds);
763  colinds = col_inds_array("C colinds", globalColinds.extent(0));
764  if (debug) {
765  std::ostringstream os;
766  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
767  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
768  std::cerr << os.str ();
769  }
770 #ifdef HAVE_TPETRA_MMM_TIMINGS
771  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("building optimized column map"))));
772 #endif
773  CrowMap = Bprime->getRowMap();
774  //OLD: CcolMap = AddKern::makeColMapAndConvertGids(Aprime->getGlobalNumCols(), globalColinds, colinds, comm);
775  //Get C's column map as the union of Aprime and Bprime col maps
776  CcolMap = AddKern::makeColMapAndConvertGids(Aprime->getGlobalNumCols(), globalColinds, colinds, comm);
777  }
778  else
779  {
780  //Aprime, Bprime and C all have the same column maps
781  auto localA = Aprime->getLocalMatrix();
782  auto localB = Bprime->getLocalMatrix();
783  auto Avals = localA.values;
784  auto Bvals = localB.values;
785  auto Arowptrs = localA.graph.row_map;
786  auto Browptrs = localB.graph.row_map;
787  auto Acolinds = localA.graph.entries;
788  auto Bcolinds = localB.graph.entries;
789  if(sorted)
790  {
791  //use sorted kernel
792 #ifdef HAVE_TPETRA_MMM_TIMINGS
793  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
794 #endif
795  if (debug) {
796  std::ostringstream os;
797  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
798  << "Call AddKern::addSorted(...)" << std::endl;
799  std::cerr << os.str ();
800  }
801  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
802  }
803  else
804  {
805  //use unsorted kernel
806 #ifdef HAVE_TPETRA_MMM_TIMINGS
807  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
808 #endif
809  if (debug) {
810  std::ostringstream os;
811  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
812  << "Call AddKern::addUnsorted(...)" << std::endl;
813  std::cerr << os.str ();
814  }
815  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
816  }
817  CrowMap = Bprime->getRowMap();
818  CcolMap = Bprime->getColMap();
819  //note: Cexport created below (if it's needed)
820  }
821 #ifdef HAVE_TPETRA_MMM_TIMINGS
822  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs constructor"))));
823 #endif
824  // C = rcp(new crs_matrix_type(CrowMap, CcolMap, rowptrs, colinds, vals, params));
825  C.replaceColMap(CcolMap);
826  C.setAllValues(rowptrs,colinds,vals);
827 #ifdef HAVE_TPETRA_MMM_TIMINGS
828  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
829 #endif
830  if(!CDomainMap->isSameAs(*CcolMap))
831  {
832  if (debug) {
833  std::ostringstream os;
834  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
835  << "Create Cimport" << std::endl;
836  std::cerr << os.str ();
837  }
838  Cimport = rcp(new import_type(CDomainMap, CcolMap));
839  }
840  if(!CrowMap->isSameAs(*CRangeMap))
841  {
842  if (debug) {
843  std::ostringstream os;
844  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
845  << "Create Cexport" << std::endl;
846  std::cerr << os.str ();
847  }
848  Cexport = rcp(new export_type(CrowMap, CRangeMap));
849  }
850 
851  if (debug) {
852  std::ostringstream os;
853  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
854  << "Call C->expertStaticFillComplete(...)" << std::endl;
855  std::cerr << os.str ();
856  }
857  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
858 
859 }
860 
// Compute the sparse-matrix sum C := scalarA*Op(A) + scalarB*Op(B), where
// Op(X) is X or its explicit transpose according to transposeA/transposeB.
// C must be nonnull on entry (checked below); it is zeroed, then each row of
// the (possibly transposed, possibly scaled) inputs is summed/inserted into it.
//
// NOTE(review): this listing appears to be missing several declaration lines
// (the CrsMatrix arguments A and B, the RCP<crs_matrix_type> argument C, and
// the map_type / crs_matrix_type / transposer_type typedefs) -- the body below
// references all of them.  Confirm against the upstream Tpetra sources before
// editing; the code lines themselves are reproduced unchanged here.
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
void Add(
  bool transposeA,
  Scalar scalarA,
  bool transposeB,
  Scalar scalarB,
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_dynamic_cast;
  using Teuchos::rcpFromRef;
  using Teuchos::tuple;
  using std::endl;
  // typedef typename ArrayView<const Scalar>::size_type size_type;
  typedef Teuchos::ScalarTraits<Scalar> STS;
  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;

  // Prefix for every exception message thrown from this function.
  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";

  // This overload requires a preallocated (nonnull) C; the null-C branch
  // further down is therefore dead in practice.
  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
    prefix << "The case C == null does not actually work. Fixing this will require an interface change.");

  TEUCHOS_TEST_FOR_EXCEPTION(
    ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
    prefix << "Both input matrices must be fill complete before calling this function.");

#ifdef HAVE_TPETRA_DEBUG
  {
    // Despite its name, this flag is true when the domain Maps of Op(A) and
    // Op(B) DIFFER (each clause is negated) -- it guards the "not the same"
    // throw below.  The transposeA && transposeB combination is not checked.
    const bool domainMapsSame =
      (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
      (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
      (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
    TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
      prefix << "The domain Maps of Op(A) and Op(B) are not the same.");

    // Same (inverted) naming convention as above, for the range Maps.
    const bool rangeMapsSame =
      (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
      (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
      (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
    TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
      prefix << "The range Maps of Op(A) and Op(B) are not the same.");
  }
#endif // HAVE_TPETRA_DEBUG

  // Form the explicit transpose of A if necessary.
  RCP<const crs_matrix_type> Aprime;
  if (transposeA) {
    transposer_type theTransposer (rcpFromRef (A));
    Aprime = theTransposer.createTranspose ();
  } else {
    Aprime = rcpFromRef (A);
  }

#ifdef HAVE_TPETRA_DEBUG
  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
    prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
#endif // HAVE_TPETRA_DEBUG

  // Form the explicit transpose of B if necessary.
  RCP<const crs_matrix_type> Bprime;
  if (transposeB) {
    transposer_type theTransposer (rcpFromRef (B));
    Bprime = theTransposer.createTranspose ();
  } else {
    Bprime = rcpFromRef (B);
  }

#ifdef HAVE_TPETRA_DEBUG
  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
    prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
#endif // HAVE_TPETRA_DEBUG

  // Allocate or zero the entries of the result matrix.
  if (! C.is_null ()) {
    C->setAllToScalar (STS::zero ());
  } else {
    // Unreachable in practice: the C.is_null() check at the top already threw.
#if 0
    // If Aprime and Bprime have the same row Map, and if C is null,
    // we can optimize construction and fillComplete of C. For now,
    // we just check pointer equality, to avoid the all-reduce in
    // isSameAs. It may be worth that all-reduce to check, however.
    //if (Aprime->getRowMap ().getRawPtr () == Bprime->getRowMap ().getRawPtr ())
    // NOTE(review): the condition below is missing a closing parenthesis and
    // would not compile if this #if 0 block were ever re-enabled.
    if (Aprime->getRowMap ()->isSameAs (* (Bprime->getRowMap ())) {
      RCP<const map_type> rowMap = Aprime->getRowMap ();

      RCP<const crs_graph_type> A_graph =
        rcp_dynamic_cast<const crs_graph_type> (Aprime->getGraph (), true);
#ifdef HAVE_TPETRA_DEBUG
      TEUCHOS_TEST_FOR_EXCEPTION(A_graph.is_null (), std::logic_error,
        "Tpetra::MatrixMatrix::Add: Graph of Op(A) is null. "
        "Please report this bug to the Tpetra developers.");
#endif // HAVE_TPETRA_DEBUG
      RCP<const crs_graph_type> B_graph =
        rcp_dynamic_cast<const crs_graph_type> (Bprime->getGraph (), true);
#ifdef HAVE_TPETRA_DEBUG
      TEUCHOS_TEST_FOR_EXCEPTION(B_graph.is_null (), std::logic_error,
        "Tpetra::MatrixMatrix::Add: Graph of Op(B) is null. "
        "Please report this bug to the Tpetra developers.");
#endif // HAVE_TPETRA_DEBUG
      RCP<const map_type> A_colMap = A_graph->getColMap ();
#ifdef HAVE_TPETRA_DEBUG
      TEUCHOS_TEST_FOR_EXCEPTION(A_colMap.is_null (), std::logic_error,
        "Tpetra::MatrixMatrix::Add: Column Map of Op(A) is null. "
        "Please report this bug to the Tpetra developers.");
#endif // HAVE_TPETRA_DEBUG
      RCP<const map_type> B_colMap = B_graph->getColMap ();
#ifdef HAVE_TPETRA_DEBUG
      TEUCHOS_TEST_FOR_EXCEPTION(B_colMap.is_null (), std::logic_error,
        "Tpetra::MatrixMatrix::Add: Column Map of Op(B) is null. "
        "Please report this bug to the Tpetra developers.");
      TEUCHOS_TEST_FOR_EXCEPTION(A_graph->getImporter ().is_null (),
        std::logic_error,
        "Tpetra::MatrixMatrix::Add: Op(A)'s Import is null. "
        "Please report this bug to the Tpetra developers.");
      TEUCHOS_TEST_FOR_EXCEPTION(B_graph->getImporter ().is_null (),
        std::logic_error,
        "Tpetra::MatrixMatrix::Add: Op(B)'s Import is null. "
        "Please report this bug to the Tpetra developers.");
#endif // HAVE_TPETRA_DEBUG

      // Compute the (column Map and) Import of the matrix sum.
      RCP<const import_type> sumImport =
        A_graph->getImporter ()->setUnion (* (B_graph->getImporter ()));
      RCP<const map_type> C_colMap = sumImport->getTargetMap ();

      // First, count the number of entries in each row. Then, go
      // back over the rows again, and compute the actual sum.
      // Remember that C may have a different column Map than Aprime
      // or Bprime, so its local indices may be different. That's why
      // we have to convert from local to global indices.

      ArrayView<const LocalOrdinal> A_local_ind;
      Array<GlobalOrdinal> A_global_ind;
      ArrayView<const LocalOrdinal> B_local_ind;
      Array<GlobalOrdinal> B_global_ind;

      const size_t localNumRows = rowMap->getNodeNumElements ();
      ArrayRCP<size_t> numEntriesPerRow (localNumRows);
      // Compute the max number of entries in any row of A+B on this
      // process, so that we won't have to resize temporary arrays.
      size_t maxNumEntriesPerRow = 0;
      for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
        // Get view of current row of A_graph, in its local indices.
        A_graph->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind);
        const size_type A_numEnt = A_local_ind.size ();
        if (A_numEnt > A_global_ind.size ()) {
          A_global_ind.resize (A_numEnt);
        }
        // Convert A's local indices to global indices.
        for (size_type k = 0; k < A_numEnt; ++k) {
          A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
        }

        // Get view of current row of B_graph, in its local indices.
        B_graph->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind);
        const size_type B_numEnt = B_local_ind.size ();
        if (B_numEnt > B_global_ind.size ()) {
          B_global_ind.resize (B_numEnt);
        }
        // Convert B's local indices to global indices.
        for (size_type k = 0; k < B_numEnt; ++k) {
          B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
        }

        // Count the number of entries in the merged row of A + B.
        const size_t curNumEntriesPerRow =
          keyMergeCount (A_global_ind.begin (), A_global_ind.end (),
                         B_global_ind.begin (), B_global_ind.end ());
        numEntriesPerRow[localRow] = curNumEntriesPerRow;
        maxNumEntriesPerRow = std::max (maxNumEntriesPerRow, curNumEntriesPerRow);
      }

      // Create C, using the sum column Map and number of entries per
      // row that we computed above. Having the exact number of
      // entries per row lets us use static profile, making it valid
      // to call expertStaticFillComplete.
      C = rcp (new crs_matrix_type (rowMap, C_colMap, numEntriesPerRow, StaticProfile));

      // Go back through the rows and actually compute the sum. We
      // don't ever have to resize A_global_ind or B_global_ind below,
      // since we've already done it above.
      ArrayView<const Scalar> A_val;
      ArrayView<const Scalar> B_val;

      Array<LocalOrdinal> AplusB_local_ind (maxNumEntriesPerRow);
      Array<GlobalOrdinal> AplusB_global_ind (maxNumEntriesPerRow);
      Array<Scalar> AplusB_val (maxNumEntriesPerRow);

      for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
        // Get view of current row of A, in A's local indices.
        Aprime->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind, A_val);
        // Convert A's local indices to global indices.
        for (size_type k = 0; k < A_local_ind.size (); ++k) {
          A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
        }

        // Get view of current row of B, in B's local indices.
        Bprime->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind, B_val);
        // Convert B's local indices to global indices.
        for (size_type k = 0; k < B_local_ind.size (); ++k) {
          B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
        }

        const size_t curNumEntries = numEntriesPerRow[localRow];
        ArrayView<LocalOrdinal> C_local_ind = AplusB_local_ind (0, curNumEntries);
        ArrayView<GlobalOrdinal> C_global_ind = AplusB_global_ind (0, curNumEntries);
        ArrayView<Scalar> C_val = AplusB_val (0, curNumEntries);

        // Sum the entries in the current row of A plus B.
        keyValueMerge (A_global_ind.begin (), A_global_ind.end (),
                       A_val.begin (), A_val.end (),
                       B_global_ind.begin (), B_global_ind.end (),
                       B_val.begin (), B_val.end (),
                       C_global_ind.begin (), C_val.begin (),
                       std::plus<Scalar> ());
        // Convert the sum's global indices into C's local indices.
        for (size_type k = 0; k < as<size_type> (numEntriesPerRow[localRow]); ++k) {
          C_local_ind[k] = C_colMap->getLocalElement (C_global_ind[k]);
        }
        // Give the current row sum to C.
        C->replaceLocalValues (localRow, C_local_ind, C_val);
      }

      // Use "expert static fill complete" to bypass construction of
      // the Import and Export (if applicable) object(s).
      RCP<const map_type> domainMap = A_graph->getDomainMap ();
      RCP<const map_type> rangeMap = A_graph->getRangeMap ();
      C->expertStaticFillComplete (domainMap, rangeMap, sumImport, A_graph->getExporter ());

      return; // Now we're done!
    }
    else {
      // FIXME (mfh 08 May 2013) When I first looked at this method, I
      // noticed that C was being given the row Map of Aprime (the
      // possibly transposed version of A). Is this what we want?
      C = rcp (new crs_matrix_type (Aprime->getRowMap (), null));
    }

#else
    // FIXME (mfh 08 May 2013) When I first looked at this method, I
    // noticed that C was being given the row Map of Aprime (the
    // possibly transposed version of A). Is this what we want?
    C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
#endif // 0
  }

#ifdef HAVE_TPETRA_DEBUG
  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
    prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
    prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
    prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
#endif // HAVE_TPETRA_DEBUG

  // Process Op(A) then Op(B), each with its own scaling factor.
  Array<RCP<const crs_matrix_type> > Mat =
    tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);

  // do a loop over each matrix to add: A reordering might be more efficient
  for (int k = 0; k < 2; ++k) {
    Array<GlobalOrdinal> Indices;
    Array<Scalar> Values;

    // Loop over each locally owned row of the current matrix (either
    // Aprime or Bprime), and sum its entries into the corresponding
    // row of C. This works regardless of whether Aprime or Bprime
    // has the same row Map as C, because both sumIntoGlobalValues and
    // insertGlobalValues allow summing resp. inserting into nonowned
    // rows of C.
#ifdef HAVE_TPETRA_DEBUG
    // NOTE(review): message says "curRowMap" but this check is on Mat[k].
    TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
      prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
#endif // HAVE_TPETRA_DEBUG
    RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
#ifdef HAVE_TPETRA_DEBUG
    TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
      prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
#endif // HAVE_TPETRA_DEBUG

    const size_t localNumRows = Mat[k]->getNodeNumRows ();
    for (size_t i = 0; i < localNumRows; ++i) {
      const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
      size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
      if (numEntries > 0) {
        Indices.resize (numEntries);
        Values.resize (numEntries);
        Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);

        // Scale the copied row in place, unless the scalar is exactly one.
        if (scalar[k] != STS::one ()) {
          for (size_t j = 0; j < numEntries; ++j) {
            Values[j] *= scalar[k];
          }
        }

        // Fill-complete C only accepts sums into existing entries;
        // otherwise new entries may be inserted.
        if (C->isFillComplete ()) {
          C->sumIntoGlobalValues (globalRow, Indices, Values);
        } else {
          C->insertGlobalValues (globalRow, Indices, Values);
        }
      }
    }
  }
}
1179 
1180 template<typename SC, typename LO, typename GO, typename NO>
1181 void AddDetails::AddKernels<SC, LO, GO, NO>::
1182 addSorted(
1183  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
1184  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
1185  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
1186  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1187  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
1188  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
1189  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
1190  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1191  typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1192  typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1193  typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
1194 {
1195  using Teuchos::TimeMonitor;
1196  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
1197  auto nrows = Arowptrs.extent(0) - 1;
1198  Crowptrs = row_ptrs_array("C row ptrs", nrows + 1);
1199  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, impl_scalar_type,
1200  execution_space, memory_space, memory_space> KKH;
1201  KKH handle;
1202  handle.create_spadd_handle(true);
1203  auto addHandle = handle.get_spadd_handle();
1204 #ifdef HAVE_TPETRA_MMM_TIMINGS
1205  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
1206 #endif
1207  KokkosSparse::Experimental::spadd_symbolic
1208  <KKH,
1209  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1210  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1211  row_ptrs_array, col_inds_array>
1212  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
1213  Cvals = values_array("C values", addHandle->get_max_result_nnz());
1214  Ccolinds = col_inds_array("C colinds", addHandle->get_max_result_nnz());
1215 #ifdef HAVE_TPETRA_MMM_TIMINGS
1216  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
1217 #endif
1218  KokkosSparse::Experimental::spadd_numeric(&handle,
1219  Arowptrs, Acolinds, Avals, scalarA,
1220  Browptrs, Bcolinds, Bvals, scalarB,
1221  Crowptrs, Ccolinds, Cvals);
1222 }
1223 
1224 template<typename SC, typename LO, typename GO, typename NO>
1225 void AddDetails::AddKernels<SC, LO, GO, NO>::
1226 addUnsorted(
1227  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
1228  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
1229  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
1230  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1231  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
1232  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
1233  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
1234  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1235  GO numGlobalCols,
1236  typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1237  typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1238  typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
1239 {
1240  using Teuchos::TimeMonitor;
1241  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
1242  auto nrows = Arowptrs.extent(0) - 1;
1243  Crowptrs = row_ptrs_array("C row ptrs", nrows + 1);
1244  typedef AddDetails::AddKernels<SC, LO, GO, NO> AddKern;
1245  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, AddKern::impl_scalar_type,
1246  AddKern::execution_space, AddKern::memory_space, AddKern::memory_space> KKH;
1247  KKH handle;
1248  handle.create_spadd_handle(false);
1249  auto addHandle = handle.get_spadd_handle();
1250 #ifdef HAVE_TPETRA_MMM_TIMINGS
1251  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
1252 #endif
1253  KokkosSparse::Experimental::spadd_symbolic
1254  <KKH,
1255  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1256  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1257  row_ptrs_array, col_inds_array>
1258  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
1259  Cvals = values_array("C values", addHandle->get_max_result_nnz());
1260  Ccolinds = col_inds_array("C colinds", addHandle->get_max_result_nnz());
1261 #ifdef HAVE_TPETRA_MMM_TIMINGS
1262  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted kernel: sorted numeric")));
1263 #endif
1264  KokkosSparse::Experimental::spadd_numeric(&handle,
1265  Arowptrs, Acolinds, Avals, scalarA,
1266  Browptrs, Bcolinds, Bvals, scalarB,
1267  Crowptrs, Ccolinds, Cvals);
1268 }
1269 
1270 template<typename GO,
1271  typename LocalIndicesType,
1272  typename GlobalIndicesType,
1273  typename ColMapType>
1274 struct ConvertColIndsFunctor
1275 {
1276  ConvertColIndsFunctor (const GO minGlobal_,
1277  const LocalIndicesType& colindsOrig_,
1278  const GlobalIndicesType& colindsConverted_,
1279  const ColMapType& colmap_) :
1280  minGlobal (minGlobal_),
1281  colindsOrig (colindsOrig_),
1282  colindsConverted (colindsConverted_),
1283  colmap (colmap_)
1284  {}
1285  KOKKOS_INLINE_FUNCTION void
1286  operator() (const size_t& i) const
1287  {
1288  colindsConverted[i] = colmap[colindsOrig[i]];
1289  }
1290  GO minGlobal;
1291  LocalIndicesType colindsOrig;
1292  GlobalIndicesType colindsConverted;
1293  ColMapType colmap;
1294 };
1295 
// Adds A and B when their column maps differ: first converts both matrices'
// local column indices to global indices (via the supplied local column-map
// views), then runs the unsorted KokkosSparse spadd symbolic/numeric kernels
// to produce C = scalarA*A + scalarB*B with globally indexed columns.
//
// NOTE(review): minGlobalCol and numGlobalCols are accepted but not read in
// this body; the conversion relies entirely on AcolMap/BcolMap.
template<typename SC, typename LO, typename GO, typename NO>
void AddDetails::AddKernels<SC, LO, GO, NO>::
convertToGlobalAndAdd(
  const typename AddDetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
  const typename AddDetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
  const typename AddDetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
  const typename AddDetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
  GO minGlobalCol,
  GO numGlobalCols,
  typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
  typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
  typename AddDetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
{
  using Teuchos::TimeMonitor;

  // Views of the inputs' CRS arrays.
  const values_array& Avals = A.values;
  const values_array& Bvals = B.values;
  const col_inds_array& Acolinds = A.graph.entries;
  const col_inds_array& Bcolinds = B.graph.entries;
  auto Arowptrs = A.graph.row_map;
  auto Browptrs = B.graph.row_map;
  // Global-index copies of the two entries arrays, filled in parallel below.
  global_col_inds_array AcolindsConverted("A colinds (converted)", Acolinds.extent(0));
  global_col_inds_array BcolindsConverted("B colinds (converted)", Bcolinds.extent(0));
#ifdef HAVE_TPETRA_MMM_TIMINGS
  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
#endif
  // Convert each matrix's local column indices through its own column map.
  ConvertColIndsFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(minGlobalCol, Acolinds, AcolindsConverted, AcolMap);
  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
  ConvertColIndsFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(minGlobalCol, Bcolinds, BcolindsConverted, BcolMap);
  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
  // Note: ordinal type is GO here (global indices), unlike addSorted/addUnsorted.
  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, GO, impl_scalar_type,
    execution_space, memory_space, memory_space> KKH;
  KKH handle;
  handle.create_spadd_handle(false); // false: converted indices are unsorted
  auto addHandle = handle.get_spadd_handle();
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
#endif
  auto nrows = Arowptrs.extent(0) - 1;
  Crowptrs = row_ptrs_array("C row ptrs", nrows + 1);
  // Symbolic phase: fills Crowptrs and bounds the result's entry count.
  KokkosSparse::Experimental::spadd_symbolic
    <KKH, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
    (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
  Cvals = values_array("C values", addHandle->get_max_result_nnz());
  Ccolinds = global_col_inds_array("C colinds", addHandle->get_max_result_nnz());
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
#endif
  // Numeric phase: computes Ccolinds/Cvals for the scaled sum.
  KokkosSparse::Experimental::spadd_numeric(&handle,
    Arowptrs, AcolindsConverted, Avals, scalarA,
    Browptrs, BcolindsConverted, Bvals, scalarB,
    Crowptrs, Ccolinds, Cvals);
}
1351 
1352 } //End namespace MatrixMatrix
1353 
1354 namespace MMdetails{
1355 
1356 /*********************************************************************************************************/
1357 // Prints MMM-style statistics on communication done with an Import or Export object
// Gathers communication statistics from a completed Import or Export
// (TransferType) and prints global min/avg/max for each quantity on rank 0.
// No-op if the transfer object is null.
template <class TransferType>
void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
  if (Transfer.is_null())
    return;

  const Distributor & Distor = Transfer->getDistributor();
  Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();

  // Rows this process exports / receives.
  size_t rows_send = Transfer->getNumExportIDs();
  size_t rows_recv = Transfer->getNumRemoteIDs();

  // "Round 1" traffic estimate: one size_t per exported/remote ID.
  size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
  size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
  size_t num_send_neighbors = Distor.getNumSends();
  size_t num_recv_neighbors = Distor.getNumReceives();
  // "Round 2" traffic as recorded by the Distributor's last do/doPosts.
  size_t round2_send, round2_recv;
  Distor.getLastDoStatistics(round2_send,round2_recv);

  int myPID = Comm->getRank();
  int NumProcs = Comm->getSize();

  // Processor by processor statistics
  // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
  //    myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);

  // Global statistics
  // Pack the eight local counters so each reduction below is a single call.
  // Even slots hold send-side stats, odd slots the matching recv-side stats.
  size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
  size_t gstats_min[8], gstats_max[8];

  // Averages are computed by summing per-process (value / NumProcs).
  double lstats_avg[8], gstats_avg[8];
  for(int i=0; i<8; i++)
    lstats_avg[i] = ((double)lstats[i])/NumProcs;

  // NOTE(review): Comm() here looks like Teuchos::RCP's raw-pointer accessor
  // (operator()), then dereferenced -- confirm; Comm->... elsewhere is the
  // more common style.
  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);

  // Only rank 0 prints; even indices 0,2,4,6 are send stats, odd are recv.
  if(!myPID) {
    printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
           (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
           (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
    printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
           (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
           (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
  }
}
1404 
1405 
/*********************************************************************************************************/
// Kernel method for computing the local portion of C = A^T * B.
//
// Strategy: (1) locally transpose A, (2) build import/extract views of the
// transpose and of B, (3) run the regular A*B new-matrix kernel into either C
// directly or a temporary, and (4) export-and-fillComplete the temporary into
// C when the transpose has an Exporter (i.e., when its rows overlap processes).
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void mult_AT_B_newmatrix(
  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
  const std::string & label,
  const Teuchos::RCP<Teuchos::ParameterList>& params)
{
  // Using & Typedefs
  using Teuchos::RCP;
  using Teuchos::rcp;
  typedef Scalar        SC;
  typedef LocalOrdinal  LO;
  typedef GlobalOrdinal GO;
  typedef Node          NO;
  typedef CrsMatrixStruct<SC,LO,GO,NO>     crs_matrix_struct_type;
  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T Transpose"))));
#endif

  /*************************************************************/
  /* 1) Local Transpose of A                                   */
  /*************************************************************/
  transposer_type transposer (rcpFromRef (A),label+std::string("XP: "));

  // Only compute global constants on the (temporary) transpose if the caller
  // explicitly asked for them; they are not needed for the multiply itself.
  RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(new Teuchos::ParameterList);
  if(!params.is_null()) transposeParams->set("compute global constants",params->get("compute global constants: temporaries",false));
  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Atrans = transposer.createTransposeLocal(transposeParams);

  /*************************************************************/
  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete              */
  /*************************************************************/
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T I&X"))));
#endif

  // Get views, asserting that no import is required to speed up computation
  crs_matrix_struct_type Aview;
  crs_matrix_struct_type Bview;
  RCP<const Import<LocalOrdinal,GlobalOrdinal, Node> > dummyImporter;

  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
  if(!params.is_null()) importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
  MMdetails::import_and_extract_views(*Atrans, Atrans->getRowMap(), Aview, dummyImporter,true, label,importParams1);

  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
  if(!params.is_null()) importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));

  // If B's row Map already matches the column Map of A^T, its rows are local
  // and we can skip the remote import; otherwise import the needed rows of B.
  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
    MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
  }
  else {
    MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
#endif

  RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >Ctemp;

  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
  bool needs_final_export = !Atrans->getGraph()->getExporter().is_null();
  if (needs_final_export)
    Ctemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Atrans->getRowMap(),0));
  else
    Ctemp = rcp(&C,false);// don't allow deallocation

  // Multiply
  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);

  /*************************************************************/
  /* 4) exportAndFillComplete matrix                           */
  /*************************************************************/
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
#endif

  // Non-owning handle on C so exportAndFillComplete can fill it in place.
  Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Crcp(&C,false);
  if (needs_final_export) {
    Teuchos::ParameterList labelList;
    labelList.set("Timer Label", label);
    if(!params.is_null()) labelList.set("compute global constants",params->get("compute global constants",true));

    // Combine the row-overlapped temporary into C's final row distribution.
    Ctemp->exportAndFillComplete(Crcp,*Ctemp->getGraph()->getExporter(),
                                 B.getDomainMap(),A.getDomainMap(),rcp(&labelList,false));
  }
#ifdef HAVE_TPETRA_MMM_STATISTICS
  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
#endif
}
1506 
/*********************************************************************************************************/
// Kernel method for computing the local portion of C = A*B
//
// Classic serial kernel: for each local row i of A it accumulates
// C(i,j) = sum_k( A(i,k)*B(k,j) ) in a sparse scratch row, in two passes —
// first over the locally owned part of B ("orig"), then over the imported
// remote part of B ("import") — and finally inserts/sums the combined row
// into C (a CrsWrapper, so C may be a matrix or a graph-backed wrapper).
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void mult_A_B(
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
  const std::string& label,
  const Teuchos::RCP<Teuchos::ParameterList>& params)
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::OrdinalTraits;
  using Teuchos::null;

  typedef Teuchos::ScalarTraits<Scalar> STS;
  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);

  // Local column-index range of B; this bounds the width of a row of C.
  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();

  // Defaults chosen so that (last - first + 1) is zero when there is no
  // import column Map (invalid() is the flag value for "none").
  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();

  // Local->global column lookups for the two parts of B.
  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
  ArrayView<const GlobalOrdinal> bcols_import = null;
  if (Bview.importColMap != null) {
    C_firstCol_import = Bview.importColMap->getMinLocalIndex();
    C_lastCol_import = Bview.importColMap->getMaxLocalIndex();

    bcols_import = Bview.importColMap->getNodeElementList();
  }

  size_t C_numCols = C_lastCol - C_firstCol +
                     OrdinalTraits<LocalOrdinal>::one();
  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
                            OrdinalTraits<LocalOrdinal>::one();

  // Size the scratch arrays for the wider of the two column ranges.
  if (C_numCols_import > C_numCols)
    C_numCols = C_numCols_import;

  // Scratch space for one sparse row of C.
  Array<Scalar> dwork = Array<Scalar>(C_numCols);
  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
  Array<size_t> iwork2 = Array<size_t>(C_numCols);

  // Aliases: C_row_i holds values, C_cols the (local, later global) column
  // ids, and c_index maps a column id to its position in C_row_i (or
  // invalid() if the column is not yet present in the current row).
  Array<Scalar> C_row_i = dwork;
  Array<GlobalOrdinal> C_cols = iwork;
  Array<size_t> c_index = iwork2;
  // combined_* hold the union of the local-B and import-B passes, hence 2x.
  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);

  size_t C_row_i_length, j, k, last_index;

  // Run through all the hash table lookups once and for all:
  // Acol2Brow maps a local column of A to the local row of B_local holding
  // that row of B; Acol2Irow maps it to the local row of B_import otherwise.
  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
    // Maps are the same: Use local IDs as the hash
    for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
          Aview.colMap->getMaxLocalIndex(); i++)
      Acol2Brow[i]=i;
  }
  else {
    // Maps are not the same: Use the map's hash
    for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
          Aview.colMap->getMaxLocalIndex(); i++) {
      GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
      LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
      if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
      else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
    }
  }

  // To form C = A*B we're going to execute this expression:
  //
  //  C(i,j) = sum_k( A(i,k)*B(k,j) )
  //
  // Our goal, of course, is to navigate the data in A and B once, without
  // performing searches for column-indices, etc.
  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
  ArrayView<const Scalar> Avals, Bvals, Ivals;
  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
  Arowptr = Arowptr_RCP();  Acolind = Acolind_RCP();  Avals = Avals_RCP();
  Browptr = Browptr_RCP();  Bcolind = Bcolind_RCP();  Bvals = Bvals_RCP();
  if(!Bview.importMatrix.is_null()) {
    Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
    Irowptr = Irowptr_RCP();  Icolind = Icolind_RCP();  Ivals = Ivals_RCP();
  }

  // If C is fillComplete we must sumInto existing entries; otherwise insert.
  bool C_filled = C.isFillComplete();

  // Sentinel-initialize the column->position table for the first row.
  for (size_t i = 0; i < C_numCols; i++)
    c_index[i] = OrdinalTraits<size_t>::invalid();

  // Loop over the rows of A.
  size_t Arows = Aview.rowMap->getNodeNumElements();
  for(size_t i=0; i<Arows; ++i) {

    // Only navigate the local portion of Aview... which is, thankfully, all of
    // A since this routine doesn't do transpose modes
    GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);

    // Loop across the i-th row of A and for each corresponding row in B, loop
    // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
    // quantities C_row_i. In other words, as we stride across B(k,:) we're
    // calculating updates for row i of the result matrix C.
    C_row_i_length = OrdinalTraits<size_t>::zero();

    // Pass 1: contributions from the locally owned part of B.
    for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
      LocalOrdinal Ak = Acol2Brow[Acolind[k]];
      const Scalar Aval = Avals[k];
      if (Aval == STS::zero())
        continue;             // skip explicitly stored zeros in A

      if (Ak == LO_INVALID)
        continue;             // this row of B is remote; handled in pass 2

      for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
        LocalOrdinal col = Bcolind[j];
        //assert(col >= 0 && col < C_numCols);

        if (c_index[col] == OrdinalTraits<size_t>::invalid()){
          //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
          // This has to be a += so insertGlobalValue goes out
          C_row_i[C_row_i_length] = Aval*Bvals[j];
          C_cols[C_row_i_length] = col;
          c_index[col] = C_row_i_length;
          C_row_i_length++;

        } else {
          C_row_i[c_index[col]] += Aval*Bvals[j];
        }
      }
    }

    // Convert pass-1 local column ids to global ids, stage them in the
    // combined arrays, and reset the c_index sentinels for pass 2.
    for (size_t ii = 0; ii < C_row_i_length; ii++) {
      c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
      C_cols[ii] = bcols[C_cols[ii]];
      combined_index[ii] = C_cols[ii];
      combined_values[ii] = C_row_i[ii];
    }
    last_index = C_row_i_length;

    //
    // Now put the C_row_i values into C.
    //
    // We might have to revamp this later.
    C_row_i_length = OrdinalTraits<size_t>::zero();

    // Pass 2: contributions from the imported (remote) part of B.
    for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
      LocalOrdinal Ak = Acol2Brow[Acolind[k]];
      const Scalar Aval = Avals[k];
      if (Aval == STS::zero())
        continue;

      if (Ak!=LO_INVALID) continue;  // local rows of B were handled in pass 1

      Ak = Acol2Irow[Acolind[k]];
      for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
        LocalOrdinal col = Icolind[j];
        //assert(col >= 0 && col < C_numCols);

        if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
          //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
          // This has to be a += so insertGlobalValue goes out
          C_row_i[C_row_i_length] = Aval*Ivals[j];
          C_cols[C_row_i_length] = col;
          c_index[col] = C_row_i_length;
          C_row_i_length++;

        } else {
          // This has to be a += so insertGlobalValue goes out
          C_row_i[c_index[col]] += Aval*Ivals[j];
        }
      }
    }

    // Append pass-2 results (converted to global ids via the import column
    // map) after the pass-1 results, and reset the sentinels for next row.
    for (size_t ii = 0; ii < C_row_i_length; ii++) {
      c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
      C_cols[ii] = bcols_import[C_cols[ii]];
      combined_index[last_index] = C_cols[ii];
      combined_values[last_index] = C_row_i[ii];
      last_index++;
    }

    // Now put the C_row_i values into C.
    // We might have to revamp this later.
    C_filled ?
      C.sumIntoGlobalValues(
          global_row,
          combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
          combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
      :
      C.insertGlobalValues(
          global_row,
          combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
          combined_values.view(OrdinalTraits<size_t>::zero(), last_index));

  }
}
1716 
1717 /*********************************************************************************************************/
1718 template<class Scalar,
1719  class LocalOrdinal,
1720  class GlobalOrdinal,
1721  class Node>
1722 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1723  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1724  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1725 
1726  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1727  Mview.maxNumRowEntries = Mview.indices[0].size();
1728 
1729  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1730  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1731  Mview.maxNumRowEntries = Mview.indices[i].size();
1732  }
1733 }
1734 
/*********************************************************************************************************/
// Heuristic estimate of the nonzero count needed to hold C = A*B.
// Follows the NZ estimate in ML's ml_matmatmult.c: take the average entries
// per row of A and of B (falling back to 100 for empty matrices), combine
// them as (sqrt(avgA) + sqrt(avgB) - 1)^2 per row, then scale by 0.75 and
// add a cushion of 100.  This is an estimate, not an upper bound.
template<class CrsMatrixType>
size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
  // Average entries per row for each operand; 100 is the ML default when a
  // matrix has no entries (or no rows) on this process.
  size_t avgNnzRowA = 100;
  size_t avgNnzRowB = 100;
  if (A.getNodeNumEntries() > 0 && A.getNodeNumRows() > 0)
    avgNnzRowA = A.getNodeNumEntries() / A.getNodeNumRows();
  if (B.getNodeNumEntries() > 0 && B.getNodeNumRows() > 0)
    avgNnzRowB = B.getNodeNumEntries() / B.getNodeNumRows();

  // Per-row estimate for the product, per the ML formula.
  size_t estPerRow = (size_t)(std::sqrt((double)avgNnzRowA) + std::sqrt((double)avgNnzRowB) - 1);
  estPerRow *= estPerRow;

  return (size_t)(A.getNodeNumRows() * estPerRow * 0.75 + 100);
}
1750 
1751 
/*********************************************************************************************************/
// Kernel method for computing the local portion of C = A*B
//
// mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
// function, so this is probably the function we want to
// thread-parallelize.
//
// This routine builds C's column Map / Import and the local-index lookup
// tables, then dispatches the actual numeric kernel to KernelWrappers.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void mult_A_B_newmatrix(
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
  const std::string& label,
  const Teuchos::RCP<Teuchos::ParameterList>& params)
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Tpetra typedefs
  typedef LocalOrdinal  LO;
  typedef GlobalOrdinal GO;
  typedef Node          NO;
  typedef Import<LO,GO,NO>  import_type;
  typedef Map<LO,GO,NO>     map_type;

  // Kokkos typedefs
  // (KCRS — the local Kokkos sparse-matrix type — is typedef'd in the full
  // file just above graph_t; it is elided in this listing.)
  typedef typename map_type::local_map_type local_map_type;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
#endif
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Build the final importer / column map, hash table lookups for C
  RCP<const import_type> Cimport;
  RCP<const map_type>    Ccolmap;
  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
    Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
  // Device-side (local) maps, captured by value into the Kokkos lambdas below.
  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
  local_map_type Irowmap_local;  if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
  local_map_type Icolmap_local;  if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();

  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
  // indices of B, to local column indices of C. (B and C have the
  // same number of columns.) The kernel uses this, instead of
  // copying the entire input matrix B and converting its column
  // indices to those of C.
  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;

  if (Bview.importMatrix.is_null()) {
    // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
    Cimport = Bimport;
    Ccolmap = Bview.colMap;
    const LO colMapSize = static_cast<LO>(Bview.colMap->getNodeNumElements());
    // Bcol2Ccol is trivial
    Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
                         Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
                         KOKKOS_LAMBDA(const LO i) {
                           Bcol2Ccol(i) = i;
                         });
  }
  else {
    // mfh 27 Sep 2016: B has "remotes," so we need to build the
    // column Map of C, as well as C's Import object (from its domain
    // Map to its column Map). C's column Map is the union of the
    // column Maps of (the local part of) B, and the "remote" part of
    // B. Ditto for the Import. We have optimized this "setUnion"
    // operation on Import objects and Maps.

    // Choose the right variant of setUnion
    if (!Bimport.is_null() && !Iimport.is_null()) {
      Cimport = Bimport->setUnion(*Iimport);
    }
    else if (!Bimport.is_null() && Iimport.is_null()) {
      Cimport = Bimport->setUnion();
    }
    else if (Bimport.is_null() && !Iimport.is_null()) {
      Cimport = Iimport->setUnion();
    }
    else {
      throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
    }
    Ccolmap = Cimport->getTargetMap();

    // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
    // in general. We should get rid of it in order to reduce
    // communication costs of sparse matrix-matrix multiply.
    TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
      std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

    // NOTE: This is not efficient and should be folded into setUnion
    //
    // mfh 27 Sep 2016: What the above comment means, is that the
    // setUnion operation on Import objects could also compute these
    // local index - to - local index look-up tables.
    Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
    local_map_type Ccolmap_local = Ccolmap->getLocalMap();
    // Translate each local column of B (resp. B_import) to C's local column
    // by round-tripping through the global index.
    Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
        Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
      });
    Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
        Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
      });

  }

  // Replace the column map
  //
  // mfh 27 Sep 2016: We do this because C was originally created
  // without a column Map. Now we have its column Map.
  C.replaceColMap(Ccolmap);

  // mfh 27 Sep 2016: Construct tables that map from local column
  // indices of A, to local row indices of either B_local (the locally
  // owned part of B), or B_remote (the "imported" remote part of B).
  //
  // For column index Aik in row i of A, if the corresponding row of B
  // exists in the local part of B ("orig") (which I'll call B_local),
  // then targetMapToOrigRow[Aik] is the local index of that row of B.
  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
  //
  // For column index Aik in row i of A, if the corresponding row of B
  // exists in the remote part of B ("Import") (which I'll call
  // B_remote), then targetMapToImportRow[Aik] is the local index of
  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
  // (a flag value).

  // Run through all the hash table lookups once and for all
  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
  Kokkos::fence();  // make sure the view allocations are visible before use
  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
      GO aidx = Acolmap_local.getGlobalElement(i);
      LO B_LID = Browmap_local.getLocalElement(aidx);
      if (B_LID != LO_INVALID) {
        targetMapToOrigRow(i)   = B_LID;
        targetMapToImportRow(i) = LO_INVALID;
      } else {
        LO I_LID = Irowmap_local.getLocalElement(aidx);
        targetMapToOrigRow(i)   = LO_INVALID;
        targetMapToImportRow(i) = I_LID;

      }
    });

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM=Teuchos::null;
#endif

  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);

}
1922 
1923 
/*********************************************************************************************************/
// AB NewMatrix Kernel wrappers (Default non-threaded version)
//
// Serial CSR-assembly kernel for C = A * (B_local + B_remote).  Builds C's
// rowptr/colind/vals arrays directly (growing the allocation on demand),
// sorts them, and finishes C with expertStaticFillComplete using the
// Cimport object built by the caller.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                                                               const LocalOrdinalViewType & targetMapToOrigRow,
                                                                                               const LocalOrdinalViewType & targetMapToImportRow,
                                                                                               const LocalOrdinalViewType & Bcol2Ccol,
                                                                                               const LocalOrdinalViewType & Icol2Ccol,
                                                                                               CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                                                               Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
                                                                                               const std::string& label,
                                                                                               const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  // (KCRS — the local Kokkos sparse-matrix type — is typedef'd in the full
  // file just above graph_t; it is elided in this listing.)
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar        SC;
  typedef LocalOrdinal  LO;
  typedef GlobalOrdinal GO;
  typedef Node          NO;
  typedef Map<LO,GO,NO> map_type;
  // NOTE(review): ST_INVALID is built from OrdinalTraits<LO>::invalid()
  // (then widened to size_t) while INVALID below uses
  // OrdinalTraits<size_t>::invalid().  For the usual signed LO these agree
  // after sign extension — confirm if an unsigned LO is ever used.
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();
  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();

  // Grab the  Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;

  // Views of the imported ("remote") part of B, if any.
  c_lno_view_t  Irowptr;
  lno_nnz_view_t  Icolind;
  scalar_view_t  Ivals;
  if(!Bview.importMatrix.is_null()) {
    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
    Ivals   = Bview.importMatrix->getLocalMatrix().values;
    b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
#endif


  // Classic csr assembly (low memory edition)
  //
  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
  // The method loops over rows of A, and may resize after processing
  // each row. Chris Siefert says that this reflects experience in
  // ML; for the non-threaded case, ML found it faster to spend less
  // effort on estimation and risk an occasional reallocation.
  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
  lno_view_t Crowptr("Crowptr",m+1);
  lno_nnz_view_t Ccolind("Ccolind",CSR_alloc);
  scalar_view_t Cvals("Cvals",CSR_alloc);

  // mfh 27 Sep 2016: The c_status array is an implementation detail
  // of the local sparse matrix-matrix multiply routine.

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
  std::vector<size_t> c_status(n, ST_INVALID);

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine. The routine computes C := A * (B_local + B_remote).
  //
  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
  // you whether the corresponding row of B belongs to B_local
  // ("orig") or B_remote ("Import").

  // For each row of A/C
  size_t CSR_ip = 0, OLD_ip = 0;  // CSR_ip: next free slot; OLD_ip: start of current row
  for (size_t i = 0; i < m; i++) {
    // mfh 27 Sep 2016: m is the number of rows in the input matrix A
    // on the calling process.
    Crowptr[i] = CSR_ip;

    // mfh 27 Sep 2016: For each entry of A in the current row of A
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik  = Acolind[k]; // local column index of current entry of A
      const SC Aval = Avals[k];   // value of current entry of A
      if (Aval == SC_ZERO)
        continue; // skip explicitly stored zero values in A

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
        // corresponding to the current entry of A is populated, then
        // the corresponding row of B is in B_local (i.e., it lives on
        // the calling process).

        // Local matrix
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        // mfh 27 Sep 2016: Go through all entries in that row of B_local.
        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          // "new in this row" test: either never seen, or seen in a prior row.
          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = Aval*Bvals[j];
            CSR_ip++;

          } else {
            Cvals[c_status[Cij]] += Aval*Bvals[j];
          }
        }

      } else {
        // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
        // corresponding to the current entry of A NOT populated (has
        // a flag "invalid" value), then the corresponding row of B is
        // in B_local (i.e., it lives on the calling process).

        // Remote matrix
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = Aval*Ivals[j];
            CSR_ip++;
          } else {
            Cvals[c_status[Cij]] += Aval*Ivals[j];
          }
        }
      }
    }

    // Resize for next pass if needed
    // Worst case for the next row is (its A-entry count) * b_max_nnz_per_row,
    // capped at n (a row of C cannot exceed C's column count).
    if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
      CSR_alloc *= 2;
      Kokkos::resize(Ccolind,CSR_alloc);
      Kokkos::resize(Cvals,CSR_alloc);
    }
    OLD_ip = CSR_ip;
  }

  Crowptr[m] = CSR_ip;

  // Downward resize
  Kokkos::resize(Ccolind,CSR_ip);
  Kokkos::resize(Cvals,CSR_ip);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort"))));
  MM2 = Teuchos::null;
#endif

  // Final sort & set of CRS arrays
  // Sorting is on by default; callers may disable via the "sort entries" flag.
  if (params.is_null() || params->get("sort entries",true))
    Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
  C.setAllValues(Crowptr,Ccolind, Cvals);


#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC"))));
#endif

  // Final FillComplete
  //
  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
  // Import (from domain Map to column Map) construction (which costs
  // lots of communication) by taking the previously constructed
  // Import object. We should be able to do this without interfering
  // with the implementation of the local part of sparse matrix-matrix
  // multply above.
  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
  labelList->set("Timer Label",label);
  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
  RCP<const Export<LO,GO,NO> > dummyExport;
  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);

}
2140 
2141 
2142 
2143 
2144 /*********************************************************************************************************/
2145 // Kernel method for computing the local portion of C = A*B (reuse)
/// \brief Compute the local portion of C = A*B in "reuse" mode.
///
/// C must already be fill-complete with the correct sparsity pattern
/// (typically built by a prior "newmatrix" call).  This routine rebuilds
/// the local index-translation tables (B/import columns -> C columns,
/// A columns -> B rows) and dispatches to the node-specific kernel.
///
/// \param Aview  [in] Local + (possibly) imported parts of A.
/// \param Bview  [in] Local + (possibly) imported parts of B.
/// \param C      [in/out] Result matrix; graph is reused, values overwritten.
/// \param label  [in] Prefix for timer names.
/// \param params [in] Optional parameter list (may be null).
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void mult_A_B_reuse(
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
  const std::string& label,
  const Teuchos::RCP<Teuchos::ParameterList>& params)
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Tpetra typedefs
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Import<LO,GO,NO> import_type;
  typedef Map<LO,GO,NO> map_type;

  // Kokkos typedefs
  // NOTE(review): KCRS (the Kokkos local sparse-matrix type of CrsMatrix) is
  // referenced below, but its typedef is not visible in this excerpt --
  // confirm it is declared at this point in the full source.
  typedef typename map_type::local_map_type local_map_type;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
#endif
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Grab all the maps.  In reuse mode C already owns its Importer and
  // column Map; we only need device-side ("local") copies of the relevant
  // maps for index translation inside the parallel_for's below.
  RCP<const import_type> Cimport = C.getGraph()->getImporter();
  RCP<const map_type>    Ccolmap = C.getColMap();
  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
  local_map_type Irowmap_local;  if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
  local_map_type Icolmap_local;  if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
  local_map_type Ccolmap_local = Ccolmap->getLocalMap();

  // Build the final importer / column map, hash table lookups for C
  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
  {
    // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
    // So, column map of C may be a strict subset of the column map of B
    Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
        Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
      });

    if (!Bview.importMatrix.is_null()) {
      TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
                                 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

      // Same translation for the imported ("remote") part of B.
      Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
      Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
          Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
        });
    }
  }

  // Run through all the hash table lookups once and for all.
  // For each local column index of A, exactly one of the two tables holds a
  // valid entry: the matching local row of B_local (targetMapToOrigRow) or
  // of the imported part of B (targetMapToImportRow); the other is LO_INVALID.
  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
      GO aidx = Acolmap_local.getGlobalElement(i);
      LO B_LID = Browmap_local.getLocalElement(aidx);
      if (B_LID != LO_INVALID) {
        targetMapToOrigRow(i)   = B_LID;
        targetMapToImportRow(i) = LO_INVALID;
      } else {
        LO I_LID = Irowmap_local.getLocalElement(aidx);
        targetMapToOrigRow(i)   = LO_INVALID;
        targetMapToImportRow(i) = I_LID;

      }
    });

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
#endif

  // Call the actual kernel.  We'll rely on partial template specialization to call the correct one ---
  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
}
2241 
2242 /*********************************************************************************************************/
2243 template<class Scalar,
2244  class LocalOrdinal,
2245  class GlobalOrdinal,
2246  class Node,
2247  class LocalOrdinalViewType>
2248 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2249  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2250  const LocalOrdinalViewType & targetMapToOrigRow,
2251  const LocalOrdinalViewType & targetMapToImportRow,
2252  const LocalOrdinalViewType & Bcol2Ccol,
2253  const LocalOrdinalViewType & Icol2Ccol,
2254  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2255  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2256  const std::string& label,
2257  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2258 #ifdef HAVE_TPETRA_MMM_TIMINGS
2259  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2260  using Teuchos::TimeMonitor;
2261  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2262  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2263 #endif
2264  using Teuchos::RCP;
2265  using Teuchos::rcp;
2266 
2267 
2268  // Lots and lots of typedefs
2270  typedef typename KCRS::StaticCrsGraphType graph_t;
2271  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2272  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2273  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2274 
2275  typedef Scalar SC;
2276  typedef LocalOrdinal LO;
2277  typedef GlobalOrdinal GO;
2278  typedef Node NO;
2279  typedef Map<LO,GO,NO> map_type;
2280  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2281  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2282  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2283 
2284  // Sizes
2285  RCP<const map_type> Ccolmap = C.getColMap();
2286  size_t m = Aview.origMatrix->getNodeNumRows();
2287  size_t n = Ccolmap->getNodeNumElements();
2288 
2289  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2290  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2291  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2292  const KCRS & Cmat = C.getLocalMatrix();
2293 
2294  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2295  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2296  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2297  scalar_view_t Cvals = Cmat.values;
2298 
2299  c_lno_view_t Irowptr;
2300  lno_nnz_view_t Icolind;
2301  scalar_view_t Ivals;
2302  if(!Bview.importMatrix.is_null()) {
2303  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2304  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2305  Ivals = Bview.importMatrix->getLocalMatrix().values;
2306  }
2307 
2308 #ifdef HAVE_TPETRA_MMM_TIMINGS
2309  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2310 #endif
2311 
2312  // Classic csr assembly (low memory edition)
2313  // mfh 27 Sep 2016: The c_status array is an implementation detail
2314  // of the local sparse matrix-matrix multiply routine.
2315 
2316  // The status array will contain the index into colind where this entry was last deposited.
2317  // c_status[i] < CSR_ip - not in the row yet
2318  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2319  // We start with this filled with INVALID's indicating that there are no entries yet.
2320  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2321  std::vector<size_t> c_status(n, ST_INVALID);
2322 
2323  // For each row of A/C
2324  size_t CSR_ip = 0, OLD_ip = 0;
2325  for (size_t i = 0; i < m; i++) {
2326  // First fill the c_status array w/ locations where we're allowed to
2327  // generate nonzeros for this row
2328  OLD_ip = Crowptr[i];
2329  CSR_ip = Crowptr[i+1];
2330  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2331  c_status[Ccolind[k]] = k;
2332 
2333  // Reset values in the row of C
2334  Cvals[k] = SC_ZERO;
2335  }
2336 
2337  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2338  LO Aik = Acolind[k];
2339  const SC Aval = Avals[k];
2340  if (Aval == SC_ZERO)
2341  continue;
2342 
2343  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2344  // Local matrix
2345  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2346 
2347  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2348  LO Bkj = Bcolind[j];
2349  LO Cij = Bcol2Ccol[Bkj];
2350 
2351  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2352  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2353  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2354 
2355  Cvals[c_status[Cij]] += Aval * Bvals[j];
2356  }
2357 
2358  } else {
2359  // Remote matrix
2360  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2361  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2362  LO Ikj = Icolind[j];
2363  LO Cij = Icol2Ccol[Ikj];
2364 
2365  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2366  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2367  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2368 
2369  Cvals[c_status[Cij]] += Aval * Ivals[j];
2370  }
2371  }
2372  }
2373  }
2374 
2375 #ifdef HAVE_TPETRA_MMM_TIMINGS
2376  MM2= Teuchos::null;
2377  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2378 #endif
2379 
2380  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2381 }
2382 
2383 
2384 /*********************************************************************************************************/
2385 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2386 template<class Scalar,
2387  class LocalOrdinal,
2388  class GlobalOrdinal,
2389  class Node>
2390 void jacobi_A_B_newmatrix(
2391  Scalar omega,
2392  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2393  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2394  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2395  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2396  const std::string& label,
2397  const Teuchos::RCP<Teuchos::ParameterList>& params)
2398 {
2399  using Teuchos::Array;
2400  using Teuchos::ArrayRCP;
2401  using Teuchos::ArrayView;
2402  using Teuchos::RCP;
2403  using Teuchos::rcp;
2404  // typedef Scalar SC;
2405  typedef LocalOrdinal LO;
2406  typedef GlobalOrdinal GO;
2407  typedef Node NO;
2408 
2409  typedef Import<LO,GO,NO> import_type;
2410  typedef Map<LO,GO,NO> map_type;
2411  typedef typename map_type::local_map_type local_map_type;
2412 
2413  // All of the Kokkos typedefs
2415  typedef typename KCRS::StaticCrsGraphType graph_t;
2416  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2417  typedef typename NO::execution_space execution_space;
2418  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2419  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2420 
2421 
2422 #ifdef HAVE_TPETRA_MMM_TIMINGS
2423  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2424  using Teuchos::TimeMonitor;
2425  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2426 #endif
2427  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2428 
2429  // Build the final importer / column map, hash table lookups for C
2430  RCP<const import_type> Cimport;
2431  RCP<const map_type> Ccolmap;
2432  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2433  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2434  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2435  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2436  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2437  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2438  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2439  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2440 
2441  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2442  // indices of B, to local column indices of C. (B and C have the
2443  // same number of columns.) The kernel uses this, instead of
2444  // copying the entire input matrix B and converting its column
2445  // indices to those of C.
2446  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2447 
2448  if (Bview.importMatrix.is_null()) {
2449  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2450  Cimport = Bimport;
2451  Ccolmap = Bview.colMap;
2452  // Bcol2Ccol is trivial
2453  // Bcol2Ccol is trivial
2454 
2455  Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2456  Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2457  Bcol2Ccol(i) = static_cast<LO> (i);
2458  });
2459  } else {
2460  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2461  // column Map of C, as well as C's Import object (from its domain
2462  // Map to its column Map). C's column Map is the union of the
2463  // column Maps of (the local part of) B, and the "remote" part of
2464  // B. Ditto for the Import. We have optimized this "setUnion"
2465  // operation on Import objects and Maps.
2466 
2467  // Choose the right variant of setUnion
2468  if (!Bimport.is_null() && !Iimport.is_null()){
2469  Cimport = Bimport->setUnion(*Iimport);
2470  Ccolmap = Cimport->getTargetMap();
2471 
2472  } else if (!Bimport.is_null() && Iimport.is_null()) {
2473  Cimport = Bimport->setUnion();
2474 
2475  } else if(Bimport.is_null() && !Iimport.is_null()) {
2476  Cimport = Iimport->setUnion();
2477 
2478  } else
2479  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2480 
2481  Ccolmap = Cimport->getTargetMap();
2482 
2483  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2484  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2485 
2486  // NOTE: This is not efficient and should be folded into setUnion
2487  //
2488  // mfh 27 Sep 2016: What the above comment means, is that the
2489  // setUnion operation on Import objects could also compute these
2490  // local index - to - local index look-up tables.
2491  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2492  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2493  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2494  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2495  });
2496  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2497  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2498  });
2499 
2500  }
2501 
2502  // Replace the column map
2503  //
2504  // mfh 27 Sep 2016: We do this because C was originally created
2505  // without a column Map. Now we have its column Map.
2506  C.replaceColMap(Ccolmap);
2507 
2508  // mfh 27 Sep 2016: Construct tables that map from local column
2509  // indices of A, to local row indices of either B_local (the locally
2510  // owned part of B), or B_remote (the "imported" remote part of B).
2511  //
2512  // For column index Aik in row i of A, if the corresponding row of B
2513  // exists in the local part of B ("orig") (which I'll call B_local),
2514  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2515  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2516  //
2517  // For column index Aik in row i of A, if the corresponding row of B
2518  // exists in the remote part of B ("Import") (which I'll call
2519  // B_remote), then targetMapToImportRow[Aik] is the local index of
2520  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2521  // (a flag value).
2522 
2523  // Run through all the hash table lookups once and for all
2524  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2525  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2526  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2527  GO aidx = Acolmap_local.getGlobalElement(i);
2528  LO B_LID = Browmap_local.getLocalElement(aidx);
2529  if (B_LID != LO_INVALID) {
2530  targetMapToOrigRow(i) = B_LID;
2531  targetMapToImportRow(i) = LO_INVALID;
2532  } else {
2533  LO I_LID = Irowmap_local.getLocalElement(aidx);
2534  targetMapToOrigRow(i) = LO_INVALID;
2535  targetMapToImportRow(i) = I_LID;
2536 
2537  }
2538  });
2539 
2540  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2541  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2542  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2543 
2544 }
2545 
2546 
2547 /*********************************************************************************************************/
2548 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2549 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2550 
/// \brief Default (non-threaded) kernel for one weighted Jacobi sweep:
///   C := (I - omega * D^{-1} * A) * (B_local + B_remote), assembled as a
///   new matrix via classic low-memory CSR assembly, then finished with
///   expertStaticFillComplete.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
                                                                                                                       const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
                                                                                                                       CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                       CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                                                                                       const LocalOrdinalViewType & targetMapToOrigRow,
                                                                                                                       const LocalOrdinalViewType & targetMapToImportRow,
                                                                                                                       const LocalOrdinalViewType & Bcol2Ccol,
                                                                                                                       const LocalOrdinalViewType & Icol2Ccol,
                                                                                                                       CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                                                                                       Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
                                                                                                                       const std::string& label,
                                                                                                                       const Teuchos::RCP<Teuchos::ParameterList>& params) {

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  // NOTE(review): "Nemwmatrix" is a typo ("Newmatrix") in this timer label.
  // Left unchanged here because timers are keyed by the exact string.
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore"))));

#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  // NOTE(review): KCRS (the Kokkos local sparse-matrix type of CrsMatrix) is
  // referenced below; its typedef is not visible in this excerpt -- confirm
  // it is declared just above in the full source.
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  // Jacobi-specific
  typedef typename scalar_view_t::memory_space scalar_memory_space;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Map<LO,GO,NO> map_type;
  // "Invalid" flag for c_status.  NOTE(review): for signed LO, invalid() is
  // -1, which converts to the all-ones size_t -- the same value as INVALID
  // below, so the two flags are interchangeable here.
  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();  // # local rows of A (and C)
  size_t n = Ccolmap->getNodeNumElements();       // # local columns of C
  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();

  // Grab the  Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;

  // Remote ("import") part of B, if any.
  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if(!Bview.importMatrix.is_null()) {
    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
    Ivals   = Bview.importMatrix->getLocalMatrix().values;
    // Include the imported rows in the per-row nnz bound used for resizing.
    b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
  }

  // Jacobi-specific inner stuff: local (device) view of the inverse diagonal.
  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();

  // Teuchos::ArrayView::operator[].
  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet.
  //   c_status[i] >= CSR_ip, this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
  Array<size_t> c_status(n, ST_INVALID);

  // Classic csr assembly (low memory edition)
  //
  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
  // The method loops over rows of A, and may resize after processing
  // each row.  Chris Siefert says that this reflects experience in
  // ML; for the non-threaded case, ML found it faster to spend less
  // effort on estimation and risk an occasional reallocation.
  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
  lno_view_t Crowptr("Crowptr",m+1);
  lno_nnz_view_t Ccolind("Ccolind",CSR_alloc);
  scalar_view_t Cvals("Cvals",CSR_alloc);
  size_t CSR_ip = 0, OLD_ip = 0;  // next free slot / start of the current row

  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine.  The routine computes
  //
  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
  //
  // This corresponds to one sweep of (weighted) Jacobi.
  //
  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
  // you whether the corresponding row of B belongs to B_local
  // ("orig") or B_remote ("Import").

  // For each row of A/C
  for (size_t i = 0; i < m; i++) {
    // mfh 27 Sep 2016: m is the number of rows in the input matrix A
    // on the calling process.
    Crowptr[i] = CSR_ip;
    // Scale factor for this row's A*B contribution: -omega * Dinv(i).
    SC minusOmegaDval = -omega*Dvals(i,0);

    // Entries of B (the identity term I*B contributes row i of B directly)
    for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
      Scalar Bval = Bvals[j];
      if (Bval == SC_ZERO)
        continue;
      LO Bij = Bcolind[j];
      LO Cij = Bcol2Ccol[Bij];

      // Assume no repeated entries in B
      c_status[Cij]   = CSR_ip;
      Ccolind[CSR_ip] = Cij;
      Cvals[CSR_ip]   = Bvals[j];
      CSR_ip++;
    }

    // Entries of -omega * Dinv * A * B
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik = Acolind[k];
      const SC Aval = Avals[k];
      if (Aval == SC_ZERO)
        continue;

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // Local matrix
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = minusOmegaDval* Aval * Bvals[j];
            CSR_ip++;

          } else {
            // Entry already created in this row; accumulate.
            Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
          }
        }

      } else {
        // Remote matrix
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = minusOmegaDval* Aval * Ivals[j];
            CSR_ip++;
          } else {
            // Entry already created in this row; accumulate.
            Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
          }
        }
      }
    }

    // Resize for next pass if needed: row i+1 can add at most one B-row's
    // worth of entries for each of its nnz(A,i+1) columns, plus its own B
    // entries (hence the +1), capped at n distinct columns.
    if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
      CSR_alloc *= 2;
      Kokkos::resize(Ccolind,CSR_alloc);
      Kokkos::resize(Cvals,CSR_alloc);
    }
    OLD_ip = CSR_ip;
  }
  Crowptr[m] = CSR_ip;

  // Downward resize
  Kokkos::resize(Ccolind,CSR_ip);
  Kokkos::resize(Cvals,CSR_ip);



#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort"))));
#endif

  // Replace the column map
  //
  // mfh 27 Sep 2016: We do this because C was originally created
  // without a column Map.  Now we have its column Map.
  // NOTE(review): Ccolmap was read from C.getColMap() above, so this looks
  // like a self-assignment -- confirm against the caller's expectations.
  C.replaceColMap(Ccolmap);

  // Final sort & set of CRS arrays
  //
  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
  // matrix-matrix multiply routine sort the entries for us?
  if (params.is_null() || params->get("sort entries",true))
    Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
  C.setAllValues(Crowptr,Ccolind, Cvals);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC"))));
#endif

  // Final FillComplete
  //
  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
  // Import (from domain Map to column Map) construction (which costs
  // lots of communication) by taking the previously constructed
  // Import object.  We should be able to do this without interfering
  // with the implementation of the local part of sparse matrix-matrix
  // multply above
  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
  labelList->set("Timer Label",label);
  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
  RCP<const Export<LO,GO,NO> > dummyExport;
  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
}
2786 
2787 
2788 /*********************************************************************************************************/
// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B (reuse)
2790 template<class Scalar,
2791  class LocalOrdinal,
2792  class GlobalOrdinal,
2793  class Node>
2794 void jacobi_A_B_reuse(
2795  Scalar omega,
2796  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2797  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2798  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2799  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2800  const std::string& label,
2801  const Teuchos::RCP<Teuchos::ParameterList>& params)
2802 {
2803  using Teuchos::Array;
2804  using Teuchos::ArrayRCP;
2805  using Teuchos::ArrayView;
2806  using Teuchos::RCP;
2807  using Teuchos::rcp;
2808 
2809  typedef LocalOrdinal LO;
2810  typedef GlobalOrdinal GO;
2811  typedef Node NO;
2812 
2813  typedef Import<LO,GO,NO> import_type;
2814  typedef Map<LO,GO,NO> map_type;
2815 
2816  // Kokkos typedefs
2817  typedef typename map_type::local_map_type local_map_type;
2819  typedef typename KCRS::StaticCrsGraphType graph_t;
2820  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2821  typedef typename NO::execution_space execution_space;
2822  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2823  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2824 
2825 #ifdef HAVE_TPETRA_MMM_TIMINGS
2826  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2827  using Teuchos::TimeMonitor;
2828  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2829 #endif
2830  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2831 
2832  // Grab all the maps
2833  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2834  RCP<const map_type> Ccolmap = C.getColMap();
2835  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2836  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2837  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2838  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2839  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2840  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2841 
2842  // Build the final importer / column map, hash table lookups for C
2843  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2844  {
2845  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2846  // So, column map of C may be a strict subset of the column map of B
2847  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2848  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2849  });
2850 
2851  if (!Bview.importMatrix.is_null()) {
2852  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2853  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2854 
2855  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2856  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2857  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2858  });
2859  }
2860  }
2861 
2862  // Run through all the hash table lookups once and for all
2863  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2864  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2865  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2866  GO aidx = Acolmap_local.getGlobalElement(i);
2867  LO B_LID = Browmap_local.getLocalElement(aidx);
2868  if (B_LID != LO_INVALID) {
2869  targetMapToOrigRow(i) = B_LID;
2870  targetMapToImportRow(i) = LO_INVALID;
2871  } else {
2872  LO I_LID = Irowmap_local.getLocalElement(aidx);
2873  targetMapToOrigRow(i) = LO_INVALID;
2874  targetMapToImportRow(i) = I_LID;
2875 
2876  }
2877  });
2878 
2879 #ifdef HAVE_TPETRA_MMM_TIMINGS
2880  MM = Teuchos::null;
2881 #endif
2882 
2883  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2884  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2885  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2886 }
2887 
2888 
2889 
/*********************************************************************************************************/
// Reuse-mode Jacobi kernel: computes C = (I - omega * Dinv * A) * B where C's
// sparsity pattern (graph) was built by a previous "newmatrix" call and is
// reused unchanged here — only C's values are overwritten.  Attempting to
// deposit into a column not already present in C's row is an error.
//
// The four LocalOrdinalViewType arguments are precomputed lookup tables
// (built by the caller, e.g. jacobi_A_B_reuse):
//   targetMapToOrigRow / targetMapToImportRow : A-column LID -> row of B /
//     row of the imported remote part of B (exactly one is valid per entry),
//   Bcol2Ccol / Icol2Ccol : B-/Bimport-column LID -> C-column LID.
// This lets the numeric loop avoid any map hash lookups.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
                                                           const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
                                                           CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                           CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                           const LocalOrdinalViewType & targetMapToOrigRow,
                                                           const LocalOrdinalViewType & targetMapToImportRow,
                                                           const LocalOrdinalViewType & Bcol2Ccol,
                                                           const LocalOrdinalViewType & Icol2Ccol,
                                                           CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                           Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
                                                           const std::string& label,
                                                           const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
#endif
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  // NOTE(review): the KCRS typedef line was lost when this listing was
  // extracted; reconstructed below as the local Kokkos sparse-matrix type of
  // CrsMatrix — confirm against the original header.
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  typedef typename scalar_view_t::memory_space scalar_memory_space;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO,GO,NO> map_type;
  // LO's invalid value, widened to size_t, is used as the "column not present
  // in this row" marker in c_status below.
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes: m = local rows of A (and C), n = local columns of C.
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
  const KCRS & Cmat = C.getLocalMatrix();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  scalar_view_t Cvals = Cmat.values;

  // Views of the imported ("remote") part of B, if any rows of B had to be
  // brought over from other processes.
  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if(!Bview.importMatrix.is_null()) {
    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
    Ivals   = Bview.importMatrix->getLocalMatrix().values;
  }

  // Jacobi-specific inner stuff: local view of the inverse-diagonal vector.
  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
#endif

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  std::vector<size_t> c_status(n, ST_INVALID);

  // For each row of A/C
  size_t CSR_ip = 0, OLD_ip = 0;
  for (size_t i = 0; i < m; i++) {

    // First fill the c_status array w/ locations where we're allowed to
    // generate nonzeros for this row
    OLD_ip = Crowptr[i];
    CSR_ip = Crowptr[i+1];
    for (size_t k = OLD_ip; k < CSR_ip; k++) {
      c_status[Ccolind[k]] = k;

      // Reset values in the row of C
      Cvals[k] = SC_ZERO;
    }

    // Scaling factor for this row's A*B contribution: -omega * Dinv(i).
    SC minusOmegaDval = -omega*Dvals(i,0);

    // Entries of B (the identity part of (I - omega Dinv A) B).
    for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
      Scalar Bval = Bvals[j];
      if (Bval == SC_ZERO)
        continue;
      LO Bij = Bcolind[j];
      LO Cij = Bcol2Ccol[Bij];

      // The reuse contract: every target column must already be in C's graph.
      TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
        std::runtime_error, "Trying to insert a new entry into a static graph");

      Cvals[c_status[Cij]] = Bvals[j];
    }

    // Entries of -omega * Dinv * A * B
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik = Acolind[k];
      const SC Aval = Avals[k];
      if (Aval == SC_ZERO)
        continue;

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // Local matrix: A's column Aik corresponds to a locally owned row of B.
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
        }

      } else {
        // Remote matrix: the needed row of B lives in the imported part.
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
        }
      }
    }
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2= Teuchos::null;
  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
#endif

  // Values were written directly into C's local arrays; fillComplete finalizes.
  C.fillComplete(C.getDomainMap(), C.getRangeMap());
}
3047 
3048 
3049 
/*********************************************************************************************************/
// Populate the 'Aview' struct with views of the rows of A, including all rows
// that correspond to elements in 'targetMap'.  Locally owned rows are viewed
// in place; if targetMap includes elements that correspond to remotely-owned
// rows of A, those rows are imported into Aview.importMatrix and viewed there.
//
// prototypeImporter, if usable (source == A's row map, target == targetMap),
// supplies the remote LIDs directly and lets us build a remote-only import.
// userAssertsThereAreNoRemotes short-circuits all communication.
// A computed importer may be cached in (and reused from) params under the key
// "importer".
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void import_and_extract_views(
  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
  bool userAssertsThereAreNoRemotes,
  const std::string& label,
  const Teuchos::RCP<Teuchos::ParameterList>& params)
{
  using Teuchos::Array;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::null;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Map<LO,GO,NO> map_type;
  typedef Import<LO,GO,NO> import_type;
  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
#endif
  // The goal of this method is to populate the 'Aview' struct with views of the
  // rows of A, including all rows that correspond to elements in 'targetMap'.
  //
  // If targetMap includes local elements that correspond to remotely-owned rows
  // of A, then those remotely-owned rows will be imported into
  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
  Aview.deleteContents();

  // Non-owning RCP: Aview merely views A, it does not take ownership.
  Aview.origMatrix   = rcp(&A, false);
  Aview.origRowMap   = A.getRowMap();
  Aview.rowMap       = targetMap;
  Aview.colMap       = A.getColMap();
  Aview.domainMap    = A.getDomainMap();
  Aview.importColMap = null;

  // Short circuit if the user swears there are no remotes
  if (userAssertsThereAreNoRemotes)
    return;

  RCP<const import_type> importer;
  if (params != null && params->isParameter("importer")) {
    // Reuse an importer cached by a previous call.
    importer = params->get<RCP<const import_type> >("importer");

  } else {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
#endif

    // Mark each row in targetMap as local or remote, and go ahead and get a view
    // for the local rows
    RCP<const map_type> rowMap = A.getRowMap(), remoteRowMap;
    size_t numRemote = 0;
    // mode: 1 = remotes taken from prototypeImporter, 2 = remotes computed by
    // hand, 3 = prototypeImporter present but unusable (only OK in serial).
    int mode = 0;
    if (!prototypeImporter.is_null() &&
        prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
        prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
      // We have a valid prototype importer --- ask it for the remotes
      ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
      numRemote = prototypeImporter->getNumRemoteIDs();

      Array<GO> remoteRows(numRemote);
      for (size_t i = 0; i < numRemote; i++)
        remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);

      remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
                                      rowMap->getIndexBase(), rowMap->getComm()));
      mode = 1;

    } else if (prototypeImporter.is_null()) {
      // No prototype importer --- count the remotes the hard way
      ArrayView<const GO> rows = targetMap->getNodeElementList();
      size_t numRows = targetMap->getNodeNumElements();

      Array<GO> remoteRows(numRows);
      for(size_t i = 0; i < numRows; ++i) {
        const LO mlid = rowMap->getLocalElement(rows[i]);

        // Not locally owned => a remote row we will have to import.
        if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
          remoteRows[numRemote++] = rows[i];
      }
      remoteRows.resize(numRemote);
      remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
                                      rowMap->getIndexBase(), rowMap->getComm()));
      mode = 2;

    } else {
      // PrototypeImporter is bad. But if we're in serial that's OK.
      mode = 3;
    }

    const int numProcs = rowMap->getComm()->getSize();
    if (numProcs < 2) {
      TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
        "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
      // If only one processor we don't need to import any remote rows, so return.
      return;
    }

    //
    // Now we will import the needed remote rows of A, if the global maximum
    // value of numRemote is greater than 0.
    //
#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
#endif

    // Collective: every process must agree on whether an import is needed.
    global_size_t globalMaxNumRemote = 0;
    Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );

    if (globalMaxNumRemote > 0) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
#endif
      // Create an importer with target-map remoteRowMap and source-map rowMap.
      if (mode == 1)
        importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
      else if (mode == 2)
        importer = rcp(new import_type(rowMap, remoteRowMap));
      else
        throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
    }

    // Cache the computed importer for subsequent calls with the same params.
    if (params != null)
      params->set("importer", importer);
  }

  if (importer != null) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
#endif

    // Now create a new matrix into which we can import the remote rows of A that we need.
    Teuchos::ParameterList labelList;
    labelList.set("Timer Label", label);
    // Minor speedup tweak - avoid computing the global constants
    if(!params.is_null())
      labelList.set("compute global constants", params->get("compute global constants",false));
    Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
                                    A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));

#if 0
    // Disabled code for dumping input matrices
    // NOTE(review): the actual dump call (a Matrix Market writeSparseFile) was
    // lost when this listing was extracted; it is inside "#if 0" and disabled.
    static int count=0;
    char str[80];
    sprintf(str,"import_matrix.%d.dat",count);
    count++;
#endif

#ifdef HAVE_TPETRA_MMM_STATISTICS
    printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
#endif


#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
#endif

    // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
    Aview.importColMap = Aview.importMatrix->getColMap();
  }
}
3226 
3227 
3228 
3229 
3230 
/*********************************************************************************************************/
// This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
// Returns a single local sparse matrix M whose rows are indexed by A's column
// map: row i of M is the corresponding row of B (if A-column i maps to a local
// B row) or of Bimport (otherwise), with column indices already translated
// into C's column map via Bcol2Ccol / Icol2Ccol.  If no merge is needed, B's
// local matrix is returned unchanged.
template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
// NOTE(review): the return-type line was lost when this listing was extracted;
// reconstructed below as the local Kokkos sparse-matrix type (the function
// returns newmat / Bk by value) — confirm against the original header.
const typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_type
merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
               const LocalOrdinalViewType & Acol2Brow,
               const LocalOrdinalViewType & Acol2Irow,
               const LocalOrdinalViewType & Bcol2Ccol,
               const LocalOrdinalViewType & Icol2Ccol,
               const size_t mergedNodeNumCols) {

  using Teuchos::RCP;
  // NOTE(review): the KCRS typedef line was lost in extraction; reconstructed
  // below — confirm against the original header.
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  // Grab the Kokkos::SparseCrsMatrices
  const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
  const KCRS & Bk = Bview.origMatrix->getLocalMatrix();

  // We need to do this dance if either (a) we have a Bimport, or (b) we don't,
  // but A's colMap is not the same as B's rowMap
  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
    // We do have a Bimport
    // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
    // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
    RCP<const KCRS> Ik_;
    if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
    const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
    // Iks is a by-value copy so the device lambdas below can capture it safely.
    KCRS Iks;
    if(Ik!=0) Iks = *Ik;
    // The merged matrix has one row per column of A.
    size_t merge_numrows = Ak.numCols();
    lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);

    const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();

    // Use a Kokkos::parallel_scan to build the rowptr
    typedef typename Node::execution_space execution_space;
    typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
    Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
      KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
        if(final) Mrowptr(i) = update;
        // Get the row count
        size_t ct=0;
        if(Acol2Brow(i)!=LO_INVALID)
          ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
        else
          ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
        update+=ct;

        // Last iteration of the final pass writes the end-of-rowptr entry.
        if(final && i+1==merge_numrows)
          Mrowptr(i+1)=update;
      });

    // Allocate nnz
    size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
    lno_nnz_view_t Mcolind("Mcolind",merge_nnz);
    scalar_view_t Mvalues("Mvals",merge_nnz);

    // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
    typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
    Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
      if(Acol2Brow(i)!=LO_INVALID) {
        // Row comes from the local part of B; remap columns to C's column map.
        size_t row   = Acol2Brow(i);
        size_t start = Bk.graph.row_map(row);
        for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
          Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
          Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
        }
      }
      else {
        // Row comes from the imported (remote) part of B.
        size_t row   = Acol2Irow(i);
        size_t start = Iks.graph.row_map(row);
        for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
          Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
          Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
        }
      }
    });

    KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
    return newmat;
  }
  else {
    // We don't have a Bimport (the easy case)
    return Bk;
  }
}//end merge_matrices
3320 
3321 
3322 
} //End namespace MMdetails
3324 
3325 } //End namespace Tpetra
3326 
3327 /*********************************************************************************************************/
3328 //
3329 // Explicit instantiation macro
3330 //
3331 // Must be expanded from within the Tpetra namespace!
3332 //
namespace Tpetra {

// Explicit-instantiation macro: expanded once per enabled (SCALAR, LO, GO,
// NODE) combination by Tpetra's ETI system.  It instantiates the public
// MatrixMatrix entry points defined in this file — Multiply, Jacobi, both
// Add overloads, add, and the AddDetails::AddKernels helper struct.
// (Comments must stay outside the macro: a "//" comment on a continued line
// would swallow the trailing backslash.)
#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
  template \
  void MatrixMatrix::Multiply( \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    bool transposeA, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
    bool transposeB, \
    CrsMatrix< SCALAR , LO , GO , NODE >& C, \
    bool call_FillComplete_on_result, \
    const std::string & label, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
template \
  void MatrixMatrix::Jacobi( \
    SCALAR omega, \
    const Vector< SCALAR, LO, GO, NODE > & Dinv, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
    CrsMatrix< SCALAR , LO , GO , NODE >& C, \
    bool call_FillComplete_on_result, \
    const std::string & label, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template \
  void MatrixMatrix::Add( \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    bool transposeA, \
    SCALAR scalarA, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
    bool transposeB, \
    SCALAR scalarB, \
    Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
\
  template \
  void MatrixMatrix::Add( \
    const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
    bool transposeA, \
    SCALAR scalarA, \
    CrsMatrix<SCALAR, LO, GO, NODE>& B, \
    SCALAR scalarB ); \
\
  template \
  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
  (const SCALAR & alpha, \
   const bool transposeA, \
   const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
   const SCALAR & beta, \
   const bool transposeB, \
   const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
   const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
   const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
   const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template struct MatrixMatrix::AddDetails::AddKernels<SCALAR, LO, GO, NODE>;

} //End namespace Tpetra
3392 
3393 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Tpetra_Import_Util.hpp
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
TpetraExt_MatrixMatrix_decl.hpp
Tpetra::MatrixMatrix::add
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Definition: TpetraExt_MatrixMatrix_def.hpp:572
Tpetra::StaticProfile
Definition: Tpetra_ConfigDefs.hpp:131
Tpetra::Classes::CrsMatrix::getNodeMaxNumRowEntries
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
Definition: Tpetra_CrsMatrix_def.hpp:785
Tpetra::MatrixMarket::Writer::writeSparseFile
static void writeSparseFile(const std::string &filename, const Teuchos::RCP< const sparse_matrix_type > &pMatrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Definition: MatrixMarket_Tpetra.hpp:5901
Tpetra::Classes::CrsMatrix::getRowMap
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
Definition: Tpetra_CrsMatrix_def.hpp:799
Tpetra::Classes::CrsMatrix::expertStaticFillComplete
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Definition: Tpetra_CrsMatrix_def.hpp:4802
Tpetra::MatrixMatrix::Jacobi
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Definition: TpetraExt_MatrixMatrix_def.hpp:282
Tpetra::Classes::CrsMatrix::isFillActive
bool isFillActive() const
Whether the matrix is not fill complete.
Definition: Tpetra_CrsMatrix_def.hpp:655
Tpetra::Classes::CrsMatrix::scale
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Definition: Tpetra_CrsMatrix_def.hpp:3667
Tpetra::Classes::CrsMatrix::getGlobalNumRows
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
Definition: Tpetra_CrsMatrix_def.hpp:704
Tpetra::Classes::CrsMatrix::replaceColMap
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
Definition: Tpetra_CrsMatrix_def.hpp:4252
Tpetra::Classes::CrsMatrix::isFillComplete
bool isFillComplete() const override
Whether the matrix is fill complete.
Definition: Tpetra_CrsMatrix_def.hpp:648
Tpetra::Classes::CrsMatrix::isLocallyIndexed
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Definition: Tpetra_CrsMatrix_def.hpp:669
Tpetra::Classes::CrsMatrix::isStaticGraph
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
Definition: Tpetra_CrsMatrix_def.hpp:898
Tpetra::Classes::CrsMatrix::getNodeNumRows
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
Definition: Tpetra_CrsMatrix_def.hpp:718
Tpetra::Classes::CrsMatrix::getRangeMap
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
Definition: Tpetra_CrsMatrix_def.hpp:820
Tpetra::Classes::DistObject< char, LocalOrdinal, GlobalOrdinal, Node >::getMap
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Definition: Tpetra_DistObject_decl.hpp:510
Tpetra::Details::Transfer
Classes::Transfer< LocalOrdinal, GlobalOrdinal, Node > Transfer
Alias for Tpetra::Classes::Details::Transfer.
Definition: Tpetra_Details_Transfer_fwd.hpp:77
Tpetra_Import_Util2.hpp
Utility functions for packing and unpacking sparse matrix entries.
Tpetra::Classes::CrsMatrix::getDomainMap
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Definition: Tpetra_CrsMatrix_def.hpp:813
Tpetra::DynamicProfile
Definition: Tpetra_ConfigDefs.hpp:132
Tpetra::Classes::CrsMatrix
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
Definition: Tpetra_CrsMatrix_decl.hpp:424
Tpetra::CrsMatrixStruct
Struct that holds views of the contents of a CrsMatrix.
Definition: TpetraExt_MMHelpers_decl.hpp:68
Tpetra::MatrixMatrix::Multiply
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
Definition: TpetraExt_MatrixMatrix_def.hpp:98
Tpetra::Classes::CrsMatrix::setAllValues
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Definition: Tpetra_CrsMatrix_def.hpp:3748
Tpetra::Classes::CrsMatrix::getComm
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Definition: Tpetra_CrsMatrix_def.hpp:627
Tpetra::Classes::RowMatrixTransposer
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Definition: Tpetra_RowMatrixTransposer_decl.hpp:80
Tpetra::Classes::Import
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Definition: Tpetra_Import_decl.hpp:115
TpetraExt_MMHelpers_def.hpp
Tpetra_Util.hpp
Stand-alone utility functions and macros.
Tpetra::Classes::CrsMatrix::getGraph
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
Definition: Tpetra_CrsMatrix_def.hpp:827
MatrixMarket_Tpetra.hpp
Matrix Market file readers and writers for Tpetra objects.
Tpetra::Classes::CrsMatrix::fillComplete
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
Definition: Tpetra_CrsMatrix_def.hpp:4617
Tpetra::Classes::CrsMatrix::local_matrix_type
KokkosSparse::CrsMatrix< impl_scalar_type, LocalOrdinal, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Definition: Tpetra_CrsMatrix_decl.hpp:483
Tpetra_Details_getEntryOnHost.hpp
Declaration and definition of Tpetra::Details::getEntryOnHost.
Tpetra::Classes::Map
A parallel distribution of indices over processes.
Definition: Tpetra_Map_decl.hpp:247
Tpetra::MatrixMatrix::Add
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Definition: TpetraExt_MatrixMatrix_def.hpp:428
Tpetra::Classes::CrsMatrix::getProfileType
ProfileType getProfileType() const
Returns true if the matrix was allocated with static data structures.
Definition: Tpetra_CrsMatrix_def.hpp:641
Tpetra_Details_computeOffsets.hpp
Declare and define the function Tpetra::Details::computeOffsetsFromCounts, an implementation detail o...
Tpetra::Classes::CrsMatrix::insertGlobalValues
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
Definition: Tpetra_CrsMatrix_def.hpp:2049
Tpetra::Classes::CrsMatrix::sumIntoGlobalValues
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Definition: Tpetra_CrsMatrix_def.hpp:2606
Tpetra::global_size_t
size_t global_size_t
Global size_t object.
Definition: Tpetra_ConfigDefs.hpp:109
Tpetra
Namespace Tpetra contains the classes and methods constituting the Tpetra library.
Tpetra::Classes::Export
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Definition: Tpetra_Export_decl.hpp:124
Tpetra::keyValueMerge
void keyValueMerge(KeyInputIterType keyBeg1, KeyInputIterType keyEnd1, ValueInputIterType valBeg1, ValueInputIterType valEnd1, KeyInputIterType keyBeg2, KeyInputIterType keyEnd2, ValueInputIterType valBeg2, ValueInputIterType valEnd2, KeyOutputIterType keyOut, ValueOutputIterType valOut, BinaryFunction f)
Merge two sorted (by keys) sequences of unique (key,value) pairs by combining pairs with equal keys.
Definition: Tpetra_Util.hpp:790
Tpetra::Classes::Vector
A distributed dense vector.
Definition: Tpetra_Vector_decl.hpp:82