TpetraExt_TripleMatrixMultiply_def.hpp
// @HEADER
// ***********************************************************************
//
//          Tpetra: Templated Linear Algebra Services Package
//                 Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
// @HEADER
#ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
#define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP

#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" // for UnmanagedView
#include "Teuchos_VerboseObject.hpp"
#include "Teuchos_Array.hpp"
#include "Tpetra_Util.hpp"
#include "Tpetra_ConfigDefs.hpp"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_RowMatrixTransposer.hpp"
#include "Tpetra_Map.hpp"
#include "Tpetra_Export.hpp"
#include "Tpetra_Import_Util.hpp"
#include "Tpetra_Import_Util2.hpp"
#include <algorithm>
#include <cmath>
#include "Teuchos_FancyOStream.hpp"
// #include "KokkosSparse_spgemm.hpp"

namespace Tpetra {

  namespace TripleMatrixMultiply {

    //
    // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
    // op(A) == A   if transposeA is false,
    // op(A) == A^T if transposeA is true,
    // and similarly for op(R) and op(P).
    //
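    // Usage sketch (illustrative, not part of this file; assumes the default
    // arguments declared in TpetraExt_TripleMatrixMultiply_decl.hpp): given
    // fill-complete matrices R, A, P and a new, empty Ac on R's row Map,
    //
    //   Tpetra::TripleMatrixMultiply::MultiplyRAP(R, false, A, false, P, false, Ac);
    //
    // forms Ac = R*A*P. For the common multigrid coarsening Ac = P^T*A*P,
    // pass P as both the first and third argument with transposeR = true:
    //
    //   Tpetra::TripleMatrixMultiply::MultiplyRAP(P, true, A, false, P, false, Ac);
    //
    // (the &R == &P check below then dispatches to the PT_A_P kernel).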
    template <class Scalar,
              class LocalOrdinal,
              class GlobalOrdinal,
              class Node>
    void MultiplyRAP(const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& R,
                     bool transposeR,
                     const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
                     bool transposeA,
                     const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& P,
                     bool transposeP,
                     CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                     bool call_FillComplete_on_result,
                     const std::string& label,
                     const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::null;
      using Teuchos::RCP;
      typedef Scalar SC;
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;
      typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
      typedef Import<LO,GO,NO> import_type;
      typedef Export<LO,GO,NO> export_type;
      typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
      typedef Map<LO,GO,NO> map_type;
      typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
#endif

      const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";

      // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);

      // The input matrices R, A, and P must all be fill complete.
      TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
      TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
      TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");

      // If transposeR is true, then Rprime will be the transpose of R
      // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
      // will just be a pointer to R.
      RCP<const crs_matrix_type> Rprime = null;
      // If transposeA is true, then Aprime will be the transpose of A
      // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
      // will just be a pointer to A.
      RCP<const crs_matrix_type> Aprime = null;
      // If transposeP is true, then Pprime will be the transpose of P
      // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
      // will just be a pointer to P.
      RCP<const crs_matrix_type> Pprime = null;

      // Is this a "clean" matrix?
      //
      // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
      // locally nor globally indexed, then it was empty. I don't like
      // this, because the most straightforward implementation presumes
      // lazy allocation of indices. However, historical precedent
      // demands that we keep around this predicate as a way to test
      // whether the matrix is empty.
      const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();

      if (transposeR && &R != &P) {
        transposer_type transposer(rcpFromRef (R));
        Rprime = transposer.createTranspose();
      } else {
        Rprime = rcpFromRef(R);
      }

      if (transposeA) {
        transposer_type transposer(rcpFromRef (A));
        Aprime = transposer.createTranspose();
      } else {
        Aprime = rcpFromRef(A);
      }

      if (transposeP) {
        transposer_type transposer(rcpFromRef (P));
        Pprime = transposer.createTranspose();
      } else {
        Pprime = rcpFromRef(P);
      }

      // Check size compatibility
      global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
      global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
      global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
      global_size_t Rleft  = transposeR ? numRCols : R.getGlobalNumRows();
      global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
      global_size_t Aleft  = transposeA ? numACols : A.getGlobalNumRows();
      global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
      global_size_t Pleft  = transposeP ? numPCols : P.getGlobalNumRows();
      global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
      TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
        prefix << "ERROR, inner dimensions of op(R) and op(A) "
        "must match for matrix-matrix product. op(R) is "
        << Rleft << "x" << Rright << ", op(A) is " << Aleft << "x" << Aright);

      TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
        prefix << "ERROR, inner dimensions of op(A) and op(P) "
        "must match for matrix-matrix product. op(A) is "
        << Aleft << "x" << Aright << ", op(P) is " << Pleft << "x" << Pright);

      // The result matrix Ac must at least have a row-map that reflects the correct
      // row-size. Don't check the number of columns because rectangular matrices
      // which were constructed with only one map can still end up having the
      // correct capacity and dimensions when filled.
      TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
        prefix << "ERROR, dimensions of result Ac must "
        "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.getGlobalNumRows()
        << " rows, should have at least " << Rleft << std::endl);

      // It doesn't matter whether Ac is already Filled or not. If it is already
      // Filled, it must have space allocated for the positions that will be
      // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
      // we'll error out later when trying to store result values.

      // CGB: However, matrix must be in active-fill
      TEUCHOS_TEST_FOR_EXCEPT( Ac.isFillActive() == false );

      // Depending on the scenario, we may need to import remotely-owned
      // sections of P if this run uses more than one process.
      int numProcs = P.getComm()->getSize();

      // Declare a couple of structs that will be used to hold views of the data
      // of R, A and P, to be used for fast access during the matrix-multiplication.
      crs_matrix_struct_type Rview;
      crs_matrix_struct_type Aview;
      crs_matrix_struct_type Pview;

      RCP<const map_type> targetMap_R = Rprime->getRowMap();
      RCP<const map_type> targetMap_A = Aprime->getRowMap();
      RCP<const map_type> targetMap_P = Pprime->getRowMap();

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
#endif

      // Now import any needed remote rows and populate the Aview struct.
      // NOTE: We assert that an import isn't needed, since we do the
      // transpose above to handle that.
      RCP<const import_type> dummyImporter;

      if (!(transposeR && &R == &P))
        MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);

      MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);

      // We will also need local access to all rows of P that correspond to the
      // column-map of op(A).
      if (numProcs > 1)
        targetMap_P = Aprime->getColMap();

      // Import any needed remote rows and populate the Pview struct.
      MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);


      RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;

      bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
      if (needs_final_export)
        Actemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Pprime->getColMap(), 0));
      else
        Actemp = rcp(&Ac, false); // don't allow deallocation

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
#endif

      // Call the appropriate method to perform the actual multiplication.
      if (call_FillComplete_on_result && newFlag) {
        if (transposeR && &R == &P)
          MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
        else
          MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
      } else if (call_FillComplete_on_result) {
        // MMdetails::mult_A_B_reuse(Aview, Bview, C, label, params);
        // Not implemented; fall back to the newmatrix kernels.
        if (transposeR && &R == &P)
          MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
        else
          MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
      } else {
        // mfh 27 Sep 2016: Is this the "slow" case? This
        // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
        // thread-parallel inserts, but that may take some effort.
        // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);

        // MMdetails::mult_A_B(Aview, Bview, crsmat, label, params);

        // #ifdef HAVE_TPETRA_MMM_TIMINGS
        //   MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
        // #endif
        // if (call_FillComplete_on_result) {
        //   // We'll call FillComplete on the C matrix before we exit, and give it a
        //   // domain-map and a range-map.
        //   // The domain-map will be the domain-map of B, unless
        //   // op(B)==transpose(B), in which case the range-map of B will be used.
        //   // The range-map will be the range-map of A, unless op(A)==transpose(A),
        //   // in which case the domain-map of A will be used.
        //   if (!C.isFillComplete())
        //     C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
        // }
        // Not implemented; fall back to the newmatrix kernels.
        if (transposeR && &R == &P)
          MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
        else
          MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
      }

      if (needs_final_export) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
        MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
#endif
        Teuchos::ParameterList labelList;
        labelList.set("Timer Label", label);
        RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
        if (!params.is_null())
          labelList.set("compute global constants", params->get("compute global constants", true));
        export_type exporter = export_type(*Pprime->getGraph()->getImporter());
        Actemp->exportAndFillComplete(Acprime,
                                      exporter,
                                      Acprime->getDomainMap(),
                                      Acprime->getRangeMap(),
                                      rcp(&labelList, false));
      }
#ifdef HAVE_TPETRA_MMM_STATISTICS
      printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label + std::string(" RAP MMM"));
#endif

    }


  } // End namespace TripleMatrixMultiply

  namespace MMdetails {


    template<class CrsMatrixType>
    size_t Ac_estimate_nnz(CrsMatrixType& A, CrsMatrixType& P) {
      size_t nnzPerRowA = 100, Pcols = 100;
      if (A.getNodeNumEntries() > 0)
        nnzPerRowA = (A.getNodeNumRows() > 0) ? A.getNodeNumEntries()/A.getNodeNumRows() : 9;
      if (P.getNodeNumEntries() > 0)
        Pcols = (P.getNodeNumCols() > 0) ? P.getNodeNumCols() : 100;
      return (size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
    }
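
    // Worked example of the heuristic above (illustrative numbers): if A has
    // 1000 local rows and 5000 local entries, then nnzPerRowA = 5000/1000 = 5;
    // if P has 50 local columns, Pcols = 50, and the estimate is
    //   50*5 + 5*5 + 300 = 575
    // entries for Ac on this process. This is only a starting allocation;
    // the kernels below resize whenever a row overflows it.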



    // Kernel method for computing the local portion of Ac = R*A*P
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void mult_R_A_P_newmatrix(
                              CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                              CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                              CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                              CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                              const std::string& label,
                              const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      //typedef Scalar SC; // unused
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;

      typedef Import<LO,GO,NO> import_type;
      typedef Map<LO,GO,NO> map_type;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
      LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

      // Build the final importer / column map, hash table lookups for Ac
      RCP<const import_type> Acimport;
      RCP<const map_type> Accolmap;
      RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
      RCP<const import_type> Iimport = Pview.importMatrix.is_null() ?
        Teuchos::null : Pview.importMatrix->getGraph()->getImporter();

      // mfh 27 Sep 2016: Pcol2Accol is a table that maps from local column
      // indices of P, to local column indices of Ac. (P and Ac have the
      // same number of columns.) The kernel uses this, instead of
      // copying the entire input matrix P and converting its column
      // indices to those of Ac.
      Array<LO> Pcol2Accol(Pview.colMap->getNodeNumElements()), PIcol2Accol;

      if (Pview.importMatrix.is_null()) {
        // mfh 27 Sep 2016: P has no "remotes," so P and Ac have the same column Map.
        Acimport = Pimport;
        Accolmap = Pview.colMap;
        // Pcol2Accol is trivial
        for (size_t i = 0; i < Pview.colMap->getNodeNumElements(); i++)
          Pcol2Accol[i] = Teuchos::as<LO>(i);

      } else {
        // mfh 27 Sep 2016: P has "remotes," so we need to build the
        // column Map of Ac, as well as Ac's Import object (from its domain
        // Map to its column Map). Ac's column Map is the union of the
        // column Maps of (the local part of) P, and the "remote" part of
        // P. Ditto for the Import. We have optimized this "setUnion"
        // operation on Import objects and Maps.

        // Choose the right variant of setUnion
        if (!Pimport.is_null() && !Iimport.is_null())
          Acimport = Pimport->setUnion(*Iimport);

        else if (!Pimport.is_null() && Iimport.is_null())
          Acimport = Pimport->setUnion();

        else if (Pimport.is_null() && !Iimport.is_null())
          Acimport = Iimport->setUnion();

        else
          throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");

        Accolmap = Acimport->getTargetMap();
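
        // For example (illustrative GIDs): if P's column Map holds global
        // indices {0, 1, 2} and the imported remote part of P holds {3, 5},
        // the setUnion yields an Ac column Map over {0, 1, 2, 3, 5}, with
        // Acimport mapping Ac's domain Map into that union. Pcol2Accol and
        // PIcol2Accol (filled below) record where each local column of P,
        // or of the import matrix, landed in the union.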

        // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
        // in general. We should get rid of it in order to reduce
        // communication costs of sparse matrix-matrix multiply.
        TEUCHOS_TEST_FOR_EXCEPTION(!Acimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
          std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

        // NOTE: This is not efficient and should be folded into setUnion
        //
        // mfh 27 Sep 2016: What the above comment means, is that the
        // setUnion operation on Import objects could also compute these
        // local index - to - local index look-up tables.
        PIcol2Accol.resize(Pview.importMatrix->getColMap()->getNodeNumElements());
        ArrayView<const GO> Pgid = Pview.origMatrix->getColMap()->getNodeElementList();
        ArrayView<const GO> Igid = Pview.importMatrix->getColMap()->getNodeElementList();

        for (size_t i = 0; i < Pview.origMatrix->getColMap()->getNodeNumElements(); i++)
          Pcol2Accol[i] = Accolmap->getLocalElement(Pgid[i]);
        for (size_t i = 0; i < Pview.importMatrix->getColMap()->getNodeNumElements(); i++)
          PIcol2Accol[i] = Accolmap->getLocalElement(Igid[i]);
      }

      // Replace the column map
      //
      // mfh 27 Sep 2016: We do this because Ac was originally created
      // without a column Map. Now we have its column Map.
      Ac.replaceColMap(Accolmap);

      // mfh 27 Sep 2016: Construct tables that map from local column
      // indices of A, to local row indices of either P_local (the locally
      // owned part of P), or P_remote (the "imported" remote part of P).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the local part of P ("orig") (which I'll call P_local),
      // then Acol2PRow[Aik] is the local index of that row of P.
      // Otherwise, Acol2PRow[Aik] is "invalid" (a flag value).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the remote part of P ("Import") (which I'll call
      // P_remote), then Acol2PRowImport[Aik] is the local index of
      // that row of P. Otherwise, Acol2PRowImport[Aik] is "invalid"
      // (a flag value).
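      //
      // A small example (illustrative indices): suppose A's column Map holds
      // global indices {10, 11, 12}, P_local owns rows 10 and 12, and
      // P_remote holds row 11. Then
      //   Acol2PRow       = { lrow(10), INVALID,  lrow(12) }
      //   Acol2PRowImport = { INVALID,  irow(11), INVALID  }
      // where lrow()/irow() denote local row indices in P_local / P_remote.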

      // Run through all the hash table lookups once and for all
      Array<LO> Acol2PRow      (Aview.colMap->getNodeNumElements(), LO_INVALID);
      Array<LO> Acol2PRowImport(Aview.colMap->getNodeNumElements(), LO_INVALID);

      for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
        LO P_LID = Pview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
        if (P_LID != LO_INVALID) {
          Acol2PRow[i] = P_LID;

        } else {
          LO I_LID = Pview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
          Acol2PRowImport[i] = I_LID;
        }
      }

      // Call the actual kernel. We'll rely on partial template specialization to call the correct one:
      // either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types).
      KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
        mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
                                            Acol2PRow, Acol2PRowImport, Pcol2Accol, PIcol2Accol,
                                            Ac, Acimport, label, params);
    }

    /*********************************************************************************************************/
    template<class InRowptrArrayType, class InColindArrayType, class InValsArrayType,
             class OutRowptrType, class OutColindType, class OutValsType>
    void copy_out_from_thread_memory(const InRowptrArrayType& Inrowptr, const InColindArrayType& Incolind, const InValsArrayType& Invalues,
                                     size_t m, double thread_chunk,
                                     OutRowptrType& row_mapC, OutColindType& entriesC, OutValsType& valuesC) {
      typedef OutRowptrType lno_view_t;
      typedef OutColindType lno_nnz_view_t;
      typedef OutValsType scalar_view_t;
      typedef typename lno_view_t::execution_space execution_space;
      typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

      // Generate the starting nnz number per thread
      size_t thread_max = Inrowptr.size();
      size_t c_nnz_size = 0;
      lno_view_t thread_start_nnz("thread_nnz", thread_max+1);
      {
        lno_view_t thread_nnz_count("thread_nnz_counts", thread_max);
        for (size_t i = 0; i < thread_max; i++)
          thread_nnz_count(i) = Inrowptr(i)(Inrowptr(i).extent(0) - 1);
        Tpetra::Details::computeOffsetsFromCounts(thread_start_nnz, thread_nnz_count);
      }
      c_nnz_size = thread_start_nnz(thread_max);

      // Allocate output
      lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size); entriesC = entriesC_;
      scalar_view_t  valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"),  c_nnz_size); valuesC  = valuesC_;

      // Copy out
      Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
        size_t my_thread_start  = tid * thread_chunk;
        size_t my_thread_stop   = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
        size_t nnz_thread_start = thread_start_nnz(tid);

        for (size_t i = my_thread_start; i < my_thread_stop; i++) {
          size_t ii = i - my_thread_start;
          // Rowptr
          row_mapC(i) = nnz_thread_start + Inrowptr(tid)(ii);
          if (i == m-1) {
            row_mapC(m) = nnz_thread_start + Inrowptr(tid)(ii+1);
          }

          // Colind / Values
          for (size_t j = Inrowptr(tid)(ii); j < Inrowptr(tid)(ii+1); j++) {
            entriesC(nnz_thread_start + j) = Incolind(tid)(j);
            valuesC(nnz_thread_start + j)  = Invalues(tid)(j);
          }
        }
      });
    } // end copy_out
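
    // Worked example of the offset computation above (illustrative counts):
    // if three threads produced 4, 2, and 5 entries respectively, then
    // computeOffsetsFromCounts turns the counts {4, 2, 5} into the exclusive
    // prefix sum {0, 4, 6, 11}. Thread t copies its entries into the output
    // arrays starting at thread_start_nnz(t), and c_nnz_size = 11.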

    /*********************************************************************************************************/
    // RAP NewMatrix Kernel wrappers (Default non-threaded version)
    // Computes R * A * P -> Ac using the classic Gustavson approach
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
    mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                        const Teuchos::Array<LocalOrdinal>& Acol2PRow,
                                        const Teuchos::Array<LocalOrdinal>& Acol2PRowImport,
                                        const Teuchos::Array<LocalOrdinal>& Pcol2Accol,
                                        const Teuchos::Array<LocalOrdinal>& PIcol2Accol,
                                        CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                        Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                        const std::string& label,
                                        const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
#endif

      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      typedef Scalar SC;
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;
      typedef Map<LO,GO,NO> map_type;
      const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
      const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
      const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

      // Sizes
      RCP<const map_type> Accolmap = Ac.getColMap();
      size_t m = Rview.origMatrix->getNodeNumRows();
      size_t n = Accolmap->getNodeNumElements();

      // Get Data Pointers
      ArrayRCP<const size_t> Rrowptr_RCP, Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
      ArrayRCP<size_t> Acrowptr_RCP;
      ArrayRCP<const LO> Rcolind_RCP, Acolind_RCP, Pcolind_RCP, Icolind_RCP;
      ArrayRCP<LO> Accolind_RCP;
      ArrayRCP<const Scalar> Rvals_RCP, Avals_RCP, Pvals_RCP, Ivals_RCP;
      ArrayRCP<SC> Acvals_RCP;

      // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
      // out of the CrsMatrix. This code computes R * A * (P_local +
      // P_remote), where P_local contains the locally owned rows of P,
      // and P_remote the (previously Import'ed) remote rows of P.

      Rview.origMatrix->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
      Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
      Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);

      if (!Pview.importMatrix.is_null())
        Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);

      // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
      // where Teuchos::ArrayRCP::operator[] may be slower than
      // Teuchos::ArrayView::operator[].

      // For efficiency
      ArrayView<const size_t> Rrowptr, Arowptr, Prowptr, Irowptr;
      ArrayView<const LO> Rcolind, Acolind, Pcolind, Icolind;
      ArrayView<const SC> Rvals, Avals, Pvals, Ivals;
      ArrayView<size_t> Acrowptr;
      ArrayView<LO> Accolind;
      ArrayView<SC> Acvals;
      Rrowptr = Rrowptr_RCP(); Rcolind = Rcolind_RCP(); Rvals = Rvals_RCP();
      Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
      Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
      if (!Pview.importMatrix.is_null()) {
        Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
      }

      // Classic csr assembly (low memory edition)
      //
      // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
      // The method loops over rows of R, and may resize after processing
      // each row. Chris Siefert says that this reflects experience in
      // ML; for the non-threaded case, ML found it faster to spend less
      // effort on estimation and risk an occasional reallocation.
      size_t nnzAllocated = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
      Acrowptr_RCP.resize(m+1);          Acrowptr = Acrowptr_RCP();
      Accolind_RCP.resize(nnzAllocated); Accolind = Accolind_RCP();
      Acvals_RCP.resize(nnzAllocated);   Acvals   = Acvals_RCP();

      // mfh 27 Sep 2016: The ac_status array is an implementation detail
      // of the local sparse matrix-matrix multiply routine.

      // The status array will contain the index into colind where this entry was last deposited.
      //   ac_status[i] <  nnz - not in the row yet
      //   ac_status[i] >= nnz - this is the entry where you can find the data
      // We start with this filled with INVALID's indicating that there are no entries yet.
      // Sadly, this complicates the code due to the fact that size_t's are unsigned.
      const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
      Array<size_t> ac_status(n, ST_INVALID);
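
      // A short trace of the ac_status bookkeeping (illustrative): while
      // accumulating row i, suppose column Acj = 7 is first hit when nnz = 12.
      // Then ac_status[7] = 12 and a new entry (column 7, value Rik*Akl*Plj)
      // is written at position 12. If column 7 is hit again within the same
      // row, ac_status[7] >= nnz_old, so the new product is accumulated into
      // Acvals[12] instead of creating a duplicate entry.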

      // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
      // routine. The routine computes Ac := R * A * (P_local + P_remote).
      //
      // For column index Aik in row i of A, Acol2PRow[Aik] tells
      // you whether the corresponding row of P belongs to P_local
      // ("orig") or P_remote ("Import").

      // For each row of R
      size_t nnz = 0, nnz_old = 0;
      for (size_t i = 0; i < m; i++) {
        // mfh 27 Sep 2016: m is the number of rows in the input matrix R
        // on the calling process.
        Acrowptr[i] = nnz;

        // mfh 27 Sep 2016: For each entry of R in the current row of R
        for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
          LO k = Rcolind[kk];       // local column index of current entry of R
          const SC Rik = Rvals[kk]; // value of current entry of R
          if (Rik == SC_ZERO)
            continue; // skip explicitly stored zero values in R
          // For each entry of A in the current row of A
          for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
            LO l = Acolind[ll];       // local column index of current entry of A
            const SC Akl = Avals[ll]; // value of current entry of A
            if (Akl == SC_ZERO)
              continue; // skip explicitly stored zero values in A


            if (Acol2PRow[l] != LO_INVALID) {
              // mfh 27 Sep 2016: If the entry of Acol2PRow
              // corresponding to the current entry of A is populated, then
              // the corresponding row of P is in P_local (i.e., it lives on
              // the calling process).

              // Local matrix
              size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);

              // mfh 27 Sep 2016: Go through all entries in that row of P_local.
              for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
                LO j = Pcolind[jj];
                LO Acj = Pcol2Accol[j];
                SC Plj = Pvals[jj];

                if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory.
                  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
                    std::runtime_error,
                    label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
#endif
                  // New entry
                  ac_status[Acj] = nnz;
                  Accolind[nnz] = Acj;
                  Acvals[nnz] = Rik*Akl*Plj;
                  nnz++;
                } else {
                  Acvals[ac_status[Acj]] += Rik*Akl*Plj;
                }
              }
            } else {
              // mfh 27 Sep 2016: If the entry of Acol2PRow
              // corresponding to the current entry of A is NOT populated (has
              // a flag "invalid" value), then the corresponding row of P is
              // in P_remote (i.e., it does not live on the calling process).

              // Remote matrix
              size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
              for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
                LO j = Icolind[jj];
                LO Acj = PIcol2Accol[j];
                SC Plj = Ivals[jj];

                if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory.
                  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
                    std::runtime_error,
                    label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
#endif
                  // New entry
                  ac_status[Acj] = nnz;
                  Accolind[nnz] = Acj;
                  Acvals[nnz] = Rik*Akl*Plj;
                  nnz++;
                } else {
                  Acvals[ac_status[Acj]] += Rik*Akl*Plj;
                }
              }
            }
          }
        }
        // Resize for next pass if needed
        if (nnz + n > nnzAllocated) {
          nnzAllocated *= 2;
          Accolind_RCP.resize(nnzAllocated); Accolind = Accolind_RCP();
          Acvals_RCP.resize(nnzAllocated);   Acvals   = Acvals_RCP();
        }
        nnz_old = nnz;
      }

      Acrowptr[m] = nnz;

      // Downward resize
      Acvals_RCP.resize(nnz);
      Accolind_RCP.resize(nnz);

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
#endif

      // Final sort & set of CRS arrays
      //
      // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
      // matrix-matrix multiply routine sort the entries for us?
      Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
      // mfh 27 Sep 2016: This just sets pointers.
      Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
#endif

      // Final FillComplete
      //
      // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
      // Import (from domain Map to column Map) construction (which costs
      // lots of communication) by taking the previously constructed
      // Import object. We should be able to do this without interfering
      // with the implementation of the local part of sparse matrix-matrix
      // multiply above.
      RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
      labelList->set("Timer Label", label);
      if (!params.is_null())
        labelList->set("compute global constants", params->get("compute global constants", true));
      RCP<const Export<LO,GO,NO> > dummyExport;
      Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
                                  Rview.origMatrix->getRangeMap(),
                                  Acimport,
                                  dummyExport,
                                  labelList);
    }

#ifdef HAVE_TPETRA_INST_OPENMP
    /*********************************************************************************************************/
    // RAP NewMatrix Kernel wrappers (Threaded LTG version, the OpenMP specialization)
    // Computes Ac = R * A * P
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal>
    struct KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>
    {
      static inline void mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
                                                             CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
                                                             CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
                                                             const Teuchos::Array<LocalOrdinal>& Acol2PRow,
                                                             const Teuchos::Array<LocalOrdinal>& Acol2PRowImport,
                                                             const Teuchos::Array<LocalOrdinal>& Pcol2Accol,
                                                             const Teuchos::Array<LocalOrdinal>& PIcol2Accol,
                                                             CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
                                                             Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
                                                             const std::string& label,
                                                             const Teuchos::RCP<Teuchos::ParameterList>& params) {
        using Teuchos::RCP;
        using Tpetra::MatrixMatrix::UnmanagedView;
#ifdef HAVE_TPETRA_MMM_TIMINGS
        std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
        using Teuchos::TimeMonitor;
        Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
#endif

        typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
        typedef Scalar SC;
        typedef LocalOrdinal LO;
        typedef GlobalOrdinal GO;
        typedef Node NO;
        typedef Map<LO,GO,NO> map_type;
        typedef typename Tpetra::CrsMatrix<SC,LO,GO,NO>::local_matrix_type KCRS;
        typedef typename KCRS::StaticCrsGraphType graph_t;
        typedef typename graph_t::row_map_type::non_const_type lno_view_t;
        typedef typename graph_t::row_map_type::const_type c_lno_view_t;
        typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
        typedef typename KCRS::values_type::non_const_type scalar_view_t;
        typedef typename KCRS::device_type device_t;
        typedef typename device_t::execution_space execution_space;
        typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

        // Unmanaged versions of the above
        typedef UnmanagedView<lno_view_t> u_lno_view_t;
        typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
        typedef UnmanagedView<scalar_view_t> u_scalar_view_t;

        const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
        const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

        // Sizes
        RCP<const map_type> Accolmap = Ac.getColMap();
        size_t m = Rview.origMatrix->getNodeNumRows();
        size_t n = Accolmap->getNodeNumElements();

        // Get raw Kokkos matrices, and the raw CSR views
        const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
        const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
        const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();

        c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
        const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
        lno_nnz_view_t Icolind;
        const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
        scalar_view_t Ivals;

        if (!Pview.importMatrix.is_null())
        {
          const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
          Irowptr = Imat.graph.row_map;
          Icolind = Imat.graph.entries;
          Ivals   = Imat.values;
        }

        // Classic csr assembly (low memory edition)
        //
        // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
        // The method loops over rows of R, and may resize after processing
        // each row. Chris Siefert says that this reflects experience in
        // ML; for the non-threaded case, ML found it faster to spend less
        // effort on estimation and risk an occasional reallocation.

        size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
        size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();

        // Get my node / thread info (right from openmp or the parameter list)
        size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
        if (!params.is_null()) {
          if (params->isParameter("openmp: ltg thread max"))
            thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
        }

        double thread_chunk = (double)(m) / thread_max;
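
        // Partitioning arithmetic (illustrative): with m = 1000 local rows and
        // thread_max = 8, thread_chunk = 125, so thread tid owns rows
        // [tid*125, (tid+1)*125). The last thread's stop index is clamped to
        // m, so it also absorbs the remainder when m is not evenly divisible.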

        // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
        // routine. The routine computes Ac := R * A * (P_local + P_remote).
        //
        // For column index Aik in row i of A, Acol2PRow[Aik] tells
        // you whether the corresponding row of P belongs to P_local
        // ("orig") or P_remote ("Import").

        // Thread-local memory
        Kokkos::View<u_lno_view_t*> tl_rowptr("top_rowptr", thread_max);
        Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind", thread_max);
        Kokkos::View<u_scalar_view_t*> tl_values("top_values", thread_max);

        // For each row of R
        Kokkos::parallel_for("MMM::RAP::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid)
        {
          // Thread coordination stuff
          size_t my_thread_start = tid * thread_chunk;
          size_t my_thread_stop  = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
          size_t my_thread_m     = my_thread_stop - my_thread_start;

          size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);

          std::vector<size_t> ac_status(n, INVALID);

          // Manually allocate the thread-local storage for Ac
          u_lno_view_t Acrowptr((typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
          u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
          u_scalar_view_t Acvals((typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);

          // Count entries as they are added to Ac
          size_t nnz = 0, nnz_old = 0;
          // bmk: loop over the rows of R which are assigned to thread tid
          for (size_t i = my_thread_start; i < my_thread_stop; i++) {
            Acrowptr(i - my_thread_start) = nnz;
            // mfh 27 Sep 2016: For each entry of R in the current row of R
            for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
              LO k = Rcolind(kk);       // local column index of current entry of R
              const SC Rik = Rvals(kk); // value of current entry of R
              if (Rik == SC_ZERO)
                continue; // skip explicitly stored zero values in R
              // For each entry of A in the current row of A
              for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
                LO l = Acolind(ll);       // local column index of current entry of A
                const SC Akl = Avals(ll); // value of current entry of A
                if (Akl == SC_ZERO)
                  continue; // skip explicitly stored zero values in A

                if (Acol2PRow[l] != LO_INVALID) {
                  // mfh 27 Sep 2016: If the entry of Acol2PRow
                  // corresponding to the current entry of A is populated, then
                  // the corresponding row of P is in P_local (i.e., it lives on
                  // the calling process).

                  // Local matrix
                  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);

                  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
                  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
                    LO j = Pcolind(jj);
                    LO Acj = Pcol2Accol[j];
                    SC Plj = Pvals(jj);

                    if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                      // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory.
                      TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
                        std::runtime_error,
                        label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
#endif
                      // New entry
                      ac_status[Acj] = nnz;
                      Accolind(nnz) = Acj;
                      Acvals(nnz) = Rik*Akl*Plj;
                      nnz++;
                    } else {
                      Acvals(ac_status[Acj]) += Rik*Akl*Plj;
                    }
                  }
                } else {
                  // mfh 27 Sep 2016: If the entry of Acol2PRow
                  // corresponding to the current entry of A is NOT populated (has
                  // a flag "invalid" value), then the corresponding row of P is
                  // in P_remote (i.e., it does not live on the calling process).

                  // Remote matrix
                  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
                  for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
                    LO j = Icolind(jj);
                    LO Acj = PIcol2Accol[j];
                    SC Plj = Ivals(jj);

                    if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                      // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory.
                      TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
                        std::runtime_error,
                        label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
#endif
                      // New entry
                      ac_status[Acj] = nnz;
                      Accolind(nnz) = Acj;
                      Acvals(nnz) = Rik*Akl*Plj;
                      nnz++;
                    } else {
                      Acvals(ac_status[Acj]) += Rik*Akl*Plj;
                    }
                  }
                }
              }
            }
            // Resize for next pass if needed
            if (nnz + n > nnzAllocated) {
              nnzAllocated *= 2;
              Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
              Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
            }
            nnz_old = nnz;
          }
          Acrowptr(my_thread_m) = nnz;
          tl_rowptr(tid) = Acrowptr;
          tl_colind(tid) = Accolind;
          tl_values(tid) = Acvals;
        });
#ifdef HAVE_TPETRA_MMM_TIMINGS
        MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
#endif

        lno_view_t rowmapAc("non_const_lnow_row", m + 1);
        lno_nnz_view_t entriesAc;
        scalar_view_t valuesAc;
        copy_out_from_thread_memory(tl_rowptr, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);

        for (size_t i = 0; i < thread_max; i++) {
          if (tl_rowptr(i).data()) free(tl_rowptr(i).data());
          if (tl_colind(i).data()) free(tl_colind(i).data());
          if (tl_values(i).data()) free(tl_values(i).data());
        }

#ifdef HAVE_TPETRA_MMM_TIMINGS
        MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
#endif

        // Final sort & set of CRS arrays
        Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
        // mfh 27 Sep 2016: This just sets pointers.
        Ac.setAllValues(rowmapAc, entriesAc, valuesAc);

#ifdef HAVE_TPETRA_MMM_TIMINGS
        MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
#endif

        // Final FillComplete
        //
        // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
        // Import (from domain Map to column Map) construction (which costs
        // lots of communication) by taking the previously constructed
        // Import object. We should be able to do this without interfering
        // with the implementation of the local part of sparse matrix-matrix
        // multiply above.
        RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
        labelList->set("Timer Label", label);
        if (!params.is_null())
          labelList->set("compute global constants", params->get("compute global constants", true));
        RCP<const Export<LO,GO,NO> > dummyExport;
        Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
                                    Rview.origMatrix->getRangeMap(),
                                    Acimport,
                                    dummyExport,
                                    labelList);
      }

      /*********************************************************************************************************/
      // PT_A_P NewMatrix Kernel wrappers (LTG OpenMP specialization)
      // Computes Ac = P^T * A * P
      static inline void mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
                                                              CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
                                                              const Teuchos::Array<LocalOrdinal>& Acol2PRow,
                                                              const Teuchos::Array<LocalOrdinal>& Acol2PRowImport,
                                                              const Teuchos::Array<LocalOrdinal>& Pcol2Accol,
                                                              const Teuchos::Array<LocalOrdinal>& PIcol2Accol,
                                                              CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
                                                              Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
                                                              const std::string& label,
                                                              const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
        std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
        using Teuchos::TimeMonitor;
        Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix OpenMP"))));
#endif

        using Teuchos::RCP;
        using Tpetra::MatrixMatrix::UnmanagedView;

        typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
        typedef Scalar SC;
        typedef LocalOrdinal LO;
        typedef GlobalOrdinal GO;
        typedef Node NO;
        typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
        typedef typename Tpetra::CrsMatrix<SC,LO,GO,NO>::local_matrix_type KCRS;
        typedef typename KCRS::StaticCrsGraphType graph_t;
        typedef typename graph_t::row_map_type::non_const_type lno_view_t;
        typedef typename graph_t::row_map_type::const_type c_lno_view_t;
        typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
        typedef typename KCRS::values_type::non_const_type scalar_view_t;
        typedef typename KCRS::device_type device_t;
        typedef typename device_t::execution_space execution_space;
        typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

        // Unmanaged versions of the above
        typedef UnmanagedView<lno_view_t> u_lno_view_t;
        typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
        typedef UnmanagedView<scalar_view_t> u_scalar_view_t;

        const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
        const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

        // number of rows on the process of the fine matrix
        // size_t m = Pview.origMatrix->getNodeNumRows();
        // number of rows on the process of the coarse matrix
        size_t n = Ac.getRowMap()->getNodeNumElements();
        LO maxAccol = Ac.getColMap()->getMaxLocalIndex();

#ifdef HAVE_TPETRA_MMM_TIMINGS
        MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
#endif

        // Take the local transpose of P, to be used as R in the RAP-style kernel below.
        transposer_type transposer(Pview.origMatrix, label + std::string("XP: "));
        RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(new Teuchos::ParameterList);
        if (!params.is_null())
          transposeParams->set("compute global constants",
                               params->get("compute global constants: temporaries",
                                           false));
        RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);

        // Get raw Kokkos matrices, and the raw CSR views
        const KCRS & Rmat = Ptrans->getLocalMatrix();
        const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
        const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();

        c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
        const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
        lno_nnz_view_t Icolind;
        const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
        scalar_view_t Ivals;

        if (!Pview.importMatrix.is_null())
        {
          const KCRS & Imat = Pview.importMatrix->getLocalMatrix();
          Irowptr = Imat.graph.row_map;
          Icolind = Imat.graph.entries;
          Ivals   = Imat.values;
        }

#ifdef HAVE_TPETRA_MMM_TIMINGS
        MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Alloc"))));
#endif

        size_t Acest_total_nnz = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix),
                                          Ac.getColMap()->getNodeNumElements());
        size_t Acest_nnz_per_row = std::ceil(Acest_total_nnz / n);
        size_t nnzPerRowA = 100;
        if (Aview.origMatrix->getNodeNumEntries() > 0)
          nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();

        // Classic csr assembly (low memory edition)
        //
        // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
        // The method loops over rows of R, and may resize after processing
        // each row. Chris Siefert says that this reflects experience in
        // ML; for the non-threaded case, ML found it faster to spend less
        // effort on estimation and risk an occasional reallocation.

        const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();

        // Get my node / thread info (right from openmp or the parameter list)
        size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
        if (!params.is_null()) {
          if (params->isParameter("openmp: ltg thread max"))
            thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
        }

        double thread_chunk = (double)(n) / thread_max;
1156 
1157 
1158  #ifdef HAVE_TPETRA_MMM_TIMINGS
1159  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1160  #endif
1161 
1162  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1163  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1164  //
1165  // For column index Aik in row i of A, Acol2PRow[Aik] tells
1166  // you whether the corresponding row of P belongs to P_local
1167  // ("orig") or P_remote ("Import").
1168 
1169  // Thread-local memory
1170  Kokkos::View<u_lno_view_t*> tl_rowptr("top_rowptr", thread_max);
1171  Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind", thread_max);
1172  Kokkos::View<u_scalar_view_t*> tl_values("top_values", thread_max);
1173 
1174  Kokkos::parallel_for("MMM::PTAP::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1175  {
1176  // Thread coordination stuff
1177  size_t my_thread_start = tid * thread_chunk;
1178  size_t my_thread_stop = tid == thread_max-1 ? n : (tid+1)*thread_chunk;
1179  size_t my_thread_n = my_thread_stop - my_thread_start;
1180 
1181  size_t nnzAllocated = (size_t) (my_thread_n * Acest_nnz_per_row + 100);
1182 
1183  std::vector<size_t> ac_status(maxAccol + 1, ST_INVALID);
1184 
1185  //manually allocate the thread-local storage for Ac
1186  u_lno_view_t Acrowptr((typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_n+1)), my_thread_n + 1);
1187  u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1188  u_scalar_view_t Acvals((typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1189  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1190  // on the calling process.
1191  size_t nnz = 0, nnz_old = 0;
1192  // bmk: loop over the rows of R which are assigned to thread tid
1193  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1194  Acrowptr(i - my_thread_start) = nnz;
1195  // mfh 27 Sep 2016: For each entry of R in the current row of R
1196  for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1197  LO k = Rcolind(kk); // local column index of current entry of R
1198  const SC Rik = Rvals(kk); // value of current entry of R
1199  if (Rik == SC_ZERO)
1200  continue; // skip explicitly stored zero values in R
1201  // For each entry of A in the current row of A
1202  for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1203  LO l = Acolind(ll); // local column index of current entry of A
1204  const SC Akl = Avals(ll); // value of current entry of A
1205  if (Akl == SC_ZERO)
1206  continue; // skip explicitly stored zero values in A
1207 
1208  if (Acol2PRow[l] != LO_INVALID) {
1209  // mfh 27 Sep 2016: If the entry of Acol2PRow
1210  // corresponding to the current entry of A is populated, then
1211  // the corresponding row of P is in P_local (i.e., it lives on
1212  // the calling process).
1213 
1214  // Local matrix
1215  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1216 
1217  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1218  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1219  LO j = Pcolind(jj);
1220  SC Plj = Pvals(jj);
1221  if (Plj == SC_ZERO)
1222  continue;
1223  LO Acj = Pcol2Accol[j];
1224 
1225  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1226  #ifdef HAVE_TPETRA_DEBUG
1227  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1228  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1229  std::runtime_error,
1230  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.extent(0) << std::endl);
1231  #endif
1232  // New entry
1233  ac_status[Acj] = nnz;
1234  Accolind(nnz) = Acj;
1235  Acvals(nnz) = Rik*Akl*Plj;
1236  nnz++;
1237  } else {
1238  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1239  }
1240  }
1241  } else {
1242  // mfh 27 Sep 2016: If the entry of Acol2PRow
1243  // corresponding to the current entry of A is NOT populated (has
1244  // a flag "invalid" value), then the corresponding row of P is
1245  // in P_reote (i.e., it does not live on the calling process).
1246 
1247  // Remote matrix
1248  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1249  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1250  LO j = Icolind(jj);
1251  SC Plj = Ivals(jj);
1252  if (Plj == SC_ZERO)
1253  continue;
1254  LO Acj = PIcol2Accol[j];
1255 
1256  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1257  #ifdef HAVE_TPETRA_DEBUG
1258  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory.
1259  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1260  std::runtime_error,
1261  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.extent(0) << std::endl);
1262  #endif
1263  // New entry
1264  ac_status[Acj] = nnz;
1265  Accolind(nnz) = Acj;
1266  Acvals(nnz) = Rik*Akl*Plj;
1267  nnz++;
1268  } else {
1269  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1270  }
1271  }
1272  }
1273  }
1274  }
1275  // Resize for next pass if needed
1276  if (nnz + std::max(5*nnzPerRowA, n) > nnzAllocated) {
1277  nnzAllocated *= 2;
1278  nnzAllocated = std::max(nnzAllocated, nnz + std::max(5*nnzPerRowA, n));
1279  Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1280  Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1281  }
1282  nnz_old = nnz;
1283  }
1284  tl_rowptr(tid) = Acrowptr;
1285  tl_colind(tid) = Accolind;
1286  tl_values(tid) = Acvals;
1287  Acrowptr(my_thread_n) = nnz;
1288  });
1289 
1290  #ifdef HAVE_TPETRA_MMM_TIMINGS
1291  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP copy from thread local"))));
1292  #endif
1293  lno_view_t rowmapAc("non_const_lno_row", n + 1);
1294  lno_nnz_view_t entriesAc;
1295  scalar_view_t valuesAc;
1296  copy_out_from_thread_memory(tl_rowptr, tl_colind, tl_values, n, thread_chunk, rowmapAc, entriesAc, valuesAc);
1297 
1298  #ifdef HAVE_TPETRA_MMM_TIMINGS
1299  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1300  #endif
1301 
1302  // Final sort & set of CRS arrays
1303  //
1304  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1305  // matrix-matrix multiply routine sort the entries for us?
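 // sortCrsEntries sorts each row's (column index, value) pairs into ascending
 // column order before the three arrays are handed to Ac below.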
1306  Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1307 
1308  // mfh 27 Sep 2016: This just sets pointers.
1309  Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1310 
1311  #ifdef HAVE_TPETRA_MMM_TIMINGS
1312  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1313  #endif
1314 
1315  // Final FillComplete
1316  //
1317  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1318  // Import (from domain Map to column Map) construction (which costs
1319  // lots of communication) by taking the previously constructed
1320  // Import object. We should be able to do this without interfering
1321  // with the implementation of the local part of sparse matrix-matrix
1322  // multiply above.
1323  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1324  labelList->set("Timer Label",label);
1325  // labelList->set("Sort column Map ghost GIDs")
1326  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1327  RCP<const Export<LO,GO,NO> > dummyExport;
1328  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1329  Pview.origMatrix->getDomainMap(),
1330  Acimport,
1331  dummyExport, labelList);
1332  }
1333  };
1334 #endif //OpenMP
1335 
1336 
1337  // Kernel method for computing the local portion of Ac = PT*A*P
1338  template<class Scalar,
1339  class LocalOrdinal,
1340  class GlobalOrdinal,
1341  class Node>
1342  void mult_PT_A_P_newmatrix(
1343  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1344  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1345  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1346  const std::string& label,
1347  const Teuchos::RCP<Teuchos::ParameterList>& params)
1348  {
1349  using Teuchos::Array;
1350  using Teuchos::ArrayRCP;
1351  using Teuchos::ArrayView;
1352  using Teuchos::RCP;
1353  using Teuchos::rcp;
1354 
1355  //typedef Scalar SC; // unused
1356  typedef LocalOrdinal LO;
1357  typedef GlobalOrdinal GO;
1358  typedef Node NO;
1359 
1360  typedef Import<LO,GO,NO> import_type;
1361  typedef Map<LO,GO,NO> map_type;
1362 
1363 #ifdef HAVE_TPETRA_MMM_TIMINGS
1364  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1365  using Teuchos::TimeMonitor;
1366  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP")))));
1367 #endif
1368 
1369  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1370 
1371  // Build the final importer / column map, hash table lookups for Ac
1372  RCP<const import_type> Acimport;
1373  RCP<const map_type> Accolmap;
1374  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
1375  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ?
1376  Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
1377 
1378  // mfh 27 Sep 2016: Pcol2Accol is a table that maps from local column
1379  // indices of P, to local column indices of Ac. (P and Ac have the
1380  // same number of columns.) The kernel uses this, instead of
1381  // copying the entire input matrix P and converting its column
1382  // indices to those of Ac.
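 // For example (hypothetical indices): if local column 5 of P holds the GID that
 // Ac's column Map stores at local index 12, then Pcol2Accol[5] == 12, and the
 // kernel translates P's column indices to Ac's with a single array read per
 // entry instead of a Map lookup.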
1383  Array<LO> Pcol2Accol(Pview.colMap->getNodeNumElements()), PIcol2Accol;
1384 
1385  if (Pview.importMatrix.is_null()) {
1386  // mfh 27 Sep 2016: P has no "remotes," so P and Ac have the same column Map.
1387  Acimport = Pimport;
1388  Accolmap = Pview.colMap;
1389  // Pcol2Accol is trivial
1390  for (size_t i = 0; i < Pview.colMap->getNodeNumElements(); i++)
1391  Pcol2Accol[i] = Teuchos::as<LO>(i);
1392 
1393  } else {
1394  // mfh 27 Sep 2016: P has "remotes," so we need to build the
1395  // column Map of Ac, as well as Ac's Import object (from its domain
1396  // Map to its column Map). Ac's column Map is the union of the
1397  // column Maps of (the local part of) P, and the "remote" part of
1398  // P. Ditto for the Import. We have optimized this "setUnion"
1399  // operation on Import objects and Maps.
1400 
1401  // Choose the right variant of setUnion
1402  if (!Pimport.is_null() && !Iimport.is_null())
1403  Acimport = Pimport->setUnion(*Iimport);
1404 
1405  else if (!Pimport.is_null() && Iimport.is_null())
1406  Acimport = Pimport->setUnion();
1407 
1408  else if (Pimport.is_null() && !Iimport.is_null())
1409  Acimport = Iimport->setUnion();
1410 
1411  else
1412  throw std::runtime_error("TpetraExt::PTAP status of matrix importers is nonsensical");
1413 
1414  Accolmap = Acimport->getTargetMap();
1415 
1416  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1417  // in general. We should get rid of it in order to reduce
1418  // communication costs of sparse matrix-matrix multiply.
1419  TEUCHOS_TEST_FOR_EXCEPTION(!Acimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
1420  std::runtime_error, "Tpetra::PTAP: Import setUnion messed with the DomainMap in an unfortunate way");
1421 
1422  // NOTE: This is not efficient and should be folded into setUnion
1423  //
1424  // mfh 27 Sep 2016: What the above comment means, is that the
1425  // setUnion operation on Import objects could also compute these
1426  // local-index-to-local-index lookup tables.
1427  PIcol2Accol.resize(Pview.importMatrix->getColMap()->getNodeNumElements());
1428  ArrayView<const GO> Pgid = Pview.origMatrix->getColMap()->getNodeElementList();
1429  ArrayView<const GO> Igid = Pview.importMatrix->getColMap()->getNodeElementList();
1430 
1431  for (size_t i = 0; i < Pview.origMatrix->getColMap()->getNodeNumElements(); i++)
1432  Pcol2Accol[i] = Accolmap->getLocalElement(Pgid[i]);
1433 
1434  for (size_t i = 0; i < Pview.importMatrix->getColMap()->getNodeNumElements(); i++)
1435  PIcol2Accol[i] = Accolmap->getLocalElement(Igid[i]);
1436 
1437  }
1438 
1439  // Replace the column map
1440  //
1441  // mfh 27 Sep 2016: We do this because Ac was originally created
1442  // without a column Map. Now we have its column Map.
1443  Ac.replaceColMap(Accolmap);
1444 
1445  // mfh 27 Sep 2016: Construct tables that map from local column
1446  // indices of A, to local row indices of either P_local (the locally
1447  // owned part of P), or P_remote (the "imported" remote part of P).
1448  //
1449  // For column index Aik in row i of A, if the corresponding row of P
1450  // exists in the local part of P ("orig") (which I'll call P_local),
1451  // then Acol2PRow[Aik] is the local index of that row of P.
1452  // Otherwise, Acol2PRow[Aik] is "invalid" (a flag value).
1453  //
1454  // For column index Aik in row i of A, if the corresponding row of P
1455  // exists in the remote part of P ("Import") (which I'll call
1456  // P_remote), then Acol2PRowImport[Aik] is the local index of
1457  // that row of P. Otherwise, Acol2PRowImport[Aik] is "invalid"
1458  // (a flag value).
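 // Concretely (hypothetical indices): if local column 4 of A has GID 40, and GID
 // 40 is a locally owned row of P with local index 2, then Acol2PRow[4] == 2 and
 // Acol2PRowImport[4] stays LO_INVALID. If GID 40 is instead only available via
 // Import, Acol2PRow[4] stays LO_INVALID and Acol2PRowImport[4] holds that row's
 // local index in P_remote.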
1459 
1460  // Run through all the hash table lookups once and for all
1461 
1462  // These two are needed for the product A*P
1463  Array<LO> Acol2PRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
1464  Array<LO> Acol2PRowImport(Aview.colMap->getNodeNumElements(), LO_INVALID);
1465  for (LO A_LID = Aview.colMap->getMinLocalIndex(); A_LID <= Aview.colMap->getMaxLocalIndex(); A_LID++) {
1466  GO A_GID = Aview.colMap->getGlobalElement(A_LID);
1467  LO P_LID = Pview.origMatrix->getRowMap()->getLocalElement(A_GID);
1468  if (P_LID != LO_INVALID) {
1469  Acol2PRow[A_LID] = P_LID;
1470  } else {
1471  LO I_LID = Pview.importMatrix->getRowMap()->getLocalElement(A_GID);
1472  Acol2PRowImport[A_LID] = I_LID;
1473  }
1474  }
1475 
1476  // Call the actual kernel. We rely on partial template specialization to call the correct one:
1477  // either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types).
1478  KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper(Aview,
1479  Pview,
1480  Acol2PRow,
1481  Acol2PRowImport,
1482  Pcol2Accol,
1483  PIcol2Accol,
1484  Ac,
1485  Acimport,
1486  label,
1487  params);
1488 
1489  }
1490 
1491 
1492  /*********************************************************************************************************/
1493  // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1494  // Computes P.T * A * P -> Ac
1495  template<class Scalar,
1496  class LocalOrdinal,
1497  class GlobalOrdinal,
1498  class Node>
1499  void KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1500  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1501  const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1502  const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1503  const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1504  const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1505  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1506  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1507  const std::string& label,
1508  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1509 #ifdef HAVE_TPETRA_MMM_TIMINGS
1510  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1511  using Teuchos::TimeMonitor;
1512  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1513 #endif
1514 
1515  using Teuchos::Array;
1516  using Teuchos::ArrayRCP;
1517  using Teuchos::ArrayView;
1518  using Teuchos::RCP;
1519  using Teuchos::rcp;
1520 
1521  typedef Scalar SC;
1522  typedef LocalOrdinal LO;
1523  typedef GlobalOrdinal GO;
1524  typedef Node NO;
1525  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1526  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1527  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1528 
1529  // number of rows on the process of the fine matrix
1530  // size_t m = Pview.origMatrix->getNodeNumRows();
1531  // number of rows on the process of the coarse matrix
1532  size_t n = Ac.getRowMap()->getNodeNumElements();
1533  LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1534 
1535  // Get Data Pointers
1536  ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
1537  ArrayRCP<size_t> Acrowptr_RCP;
1538  ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
1539  ArrayRCP<LO> Accolind_RCP;
1540  ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
1541  ArrayRCP<SC> Acvals_RCP;
1542 
1543  // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
1544  // out of the CrsMatrix. This code computes R * A * (P_local +
1545  // P_remote), where P_local contains the locally owned rows of P,
1546  // and P_remote the (previously Import'ed) remote rows of P.
1547 
1548  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1549  Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
1550 
1551  if (!Pview.importMatrix.is_null())
1552  Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1553 
1554  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1555  // where Teuchos::ArrayRCP::operator[] may be slower than
1556  // Teuchos::ArrayView::operator[].
1557 
1558  // For efficiency
1559  ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
1560  ArrayView<const LO> Acolind, Pcolind, Icolind;
1561  ArrayView<const SC> Avals, Pvals, Ivals;
1562  ArrayView<size_t> Acrowptr;
1563  ArrayView<LO> Accolind;
1564  ArrayView<SC> Acvals;
1565  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1566  Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
1567  if (!Pview.importMatrix.is_null()) {
1568  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1569  }
1570 
1571 #ifdef HAVE_TPETRA_MMM_TIMINGS
1572  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1573 #endif
1574 
1576  // Get the local transpose of the graph of P by locally transposing
1577  // all of P
1578 
1579  ArrayRCP<const size_t> Rrowptr_RCP;
1580  ArrayRCP<const LO> Rcolind_RCP;
1581  ArrayRCP<const Scalar> Rvals_RCP;
1582  ArrayView<const size_t> Rrowptr;
1583  ArrayView<const LO> Rcolind;
1584  ArrayView<const SC> Rvals;
1585 
1586  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1587  RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(new Teuchos::ParameterList);
1588  if (!params.is_null())
1589  transposeParams->set("compute global constants",
1590  params->get("compute global constants: temporaries",
1591  false));
1592  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
1593 
1594  Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
1595  Rrowptr = Rrowptr_RCP();
1596  Rcolind = Rcolind_RCP();
1597  Rvals = Rvals_RCP();
1598 
1599 #ifdef HAVE_TPETRA_MMM_TIMINGS
1600  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Alloc"))));
1601 #endif
1602 
1603  size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix),
1604  Ac.getColMap()->getNodeNumElements());
1605  size_t nnzPerRowA = 100;
1606  if (Aview.origMatrix->getNodeNumEntries() > 0)
1607  nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();
1608  Acrowptr_RCP.resize(n+1); Acrowptr = Acrowptr_RCP();
1609  Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1610  Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1611 
1612  // Classic csr assembly (low memory edition)
1613  //
1614  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1615  // The method loops over rows of R, and may resize after processing
1616  // each row. Chris Siefert says that this reflects experience in
1617  // ML; for the non-threaded case, ML found it faster to spend less
1618  // effort on estimation and risk an occasional reallocation.
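 // As a rough illustration with hypothetical numbers: if nnzPerRowA == 20 and
 // n == 1000, the resize check below demands max(5*20, 1000) == 1000 entries of
 // headroom before starting each row; once that headroom is exhausted, the
 // allocation is doubled (or grown directly to the required size, if larger).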
1619 
1620  // mfh 27 Sep 2016: The ac_status array is an implementation detail
1621  // of the local sparse matrix-matrix multiply routine.
1622 
1623  // The status array will contain the index into colind where this entry was last deposited.
1624  //   ac_status[i] <  nnz_old  - not yet present in the current row
1625  //   ac_status[i] >= nnz_old  - already present in this row; its data lives at position ac_status[i]
1626  // We start with this filled with INVALID's, indicating that there are no entries yet.
1627  // Sadly, this complicates the code because size_t is unsigned, so there is no negative flag value.
1628  const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1629  Array<size_t> ac_status(maxAccol+1, ST_INVALID);
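 // A small worked example with hypothetical indices: suppose the previous row
 // finished with nnz_old == nnz == 3, and the current row touches column Acj == 7
 // twice. On the first visit ac_status[7] is ST_INVALID (or < 3), so the entry is
 // deposited at Accolind[3], ac_status[7] is set to 3, and nnz becomes 4. On the
 // second visit ac_status[7] == 3 >= nnz_old, so the product is accumulated into
 // Acvals[3] rather than creating a duplicate column entry.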
1630 
1631 
1632 #ifdef HAVE_TPETRA_MMM_TIMINGS
1633  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1634 #endif
1635 
1636  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1637  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1638  //
1639  // For column index Aik in row i of A, Acol2PRow[Aik] tells
1640  // you whether the corresponding row of P belongs to P_local
1641  // ("orig") or P_remote ("Import").
1642 
1643  size_t nnz = 0, nnz_old = 0;
1644 
1645  // For each row of R
1646  for (size_t i = 0; i < n; i++) {
1647  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1648  // on the calling process.
1649  Acrowptr[i] = nnz;
1650 
1651  // mfh 27 Sep 2016: For each entry of R in the current row of R
1652  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1653  LO k = Rcolind[kk]; // local column index of current entry of R
1654  const SC Rik = Rvals[kk]; // value of current entry of R
1655  if (Rik == SC_ZERO)
1656  continue; // skip explicitly stored zero values in R
1657  // For each entry of A in the current row of A
1658  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1659  LO l = Acolind[ll]; // local column index of current entry of A
1660  const SC Akl = Avals[ll]; // value of current entry of A
1661  if (Akl == SC_ZERO)
1662  continue; // skip explicitly stored zero values in A
1663 
1664  if (Acol2PRow[l] != LO_INVALID) {
1665  // mfh 27 Sep 2016: If the entry of Acol2PRow
1666  // corresponding to the current entry of A is populated, then
1667  // the corresponding row of P is in P_local (i.e., it lives on
1668  // the calling process).
1669 
1670  // Local matrix
1671  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1672 
1673  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1674  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1675  LO j = Pcolind[jj];
1676  SC Plj = Pvals[jj];
1677  if (Plj == SC_ZERO)
1678  continue;
1679  LO Acj = Pcol2Accol[j];
1680 
1681  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1682 #ifdef HAVE_TPETRA_DEBUG
1683  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory.
1684  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1685  std::runtime_error,
1686  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1687 #endif
1688  // New entry
1689  ac_status[Acj] = nnz;
1690  Accolind[nnz] = Acj;
1691  Acvals[nnz] = Rik*Akl*Plj;
1692  nnz++;
1693  } else {
1694  Acvals[ac_status[Acj]] += Rik*Akl*Plj;
1695  }
1696  }
1697  } else {
1698  // mfh 27 Sep 2016: If the entry of Acol2PRow
1699  // corresponding to the current entry of A is NOT populated (has
1700  // a flag "invalid" value), then the corresponding row of P is
1701  // in P_remote (i.e., it does not live on the calling process).
1702 
1703  // Remote matrix
1704  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1705  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1706  LO j = Icolind[jj];
1707  SC Plj = Ivals[jj];
1708  if (Plj == SC_ZERO)
1709  continue;
1710  LO Acj = PIcol2Accol[j];
1711 
1712  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1713 #ifdef HAVE_TPETRA_DEBUG
1714  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory.
1715  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1716  std::runtime_error,
1717  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1718 #endif
1719  // New entry
1720  ac_status[Acj] = nnz;
1721  Accolind[nnz] = Acj;
1722  Acvals[nnz] = Rik*Akl*Plj;
1723  nnz++;
1724  } else {
1725  Acvals[ac_status[Acj]] += Rik*Akl*Plj;
1726  }
1727  }
1728  }
1729  }
1730  }
1731  // Resize for next pass if needed
1732  if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1733  nnz_alloc *= 2;
1734  nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1735  Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1736  Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1737  }
1738  nnz_old = nnz;
1739  }
1740 
1741  Acrowptr[n] = nnz;
1742 
1743  // Downward resize
1744  Acvals_RCP.resize(nnz);
1745  Accolind_RCP.resize(nnz);
1746 
1747 #ifdef HAVE_TPETRA_MMM_TIMINGS
1748  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1749 #endif
1750 
1751  // Final sort & set of CRS arrays
1752  //
1753  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1754  // matrix-matrix multiply routine sort the entries for us?
1755  Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1756 
1757  // mfh 27 Sep 2016: This just sets pointers.
1758  Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1759 
1760 #ifdef HAVE_TPETRA_MMM_TIMINGS
1761  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1762 #endif
1763 
1764  // Final FillComplete
1765  //
1766  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1767  // Import (from domain Map to column Map) construction (which costs
1768  // lots of communication) by taking the previously constructed
1769  // Import object. We should be able to do this without interfering
1770  // with the implementation of the local part of sparse matrix-matrix
1771  // multiply above.
1772  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1773  labelList->set("Timer Label",label);
1774  // labelList->set("Sort column Map ghost GIDs")
1775  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1776  RCP<const Export<LO,GO,NO> > dummyExport;
1777  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1778  Pview.origMatrix->getDomainMap(),
1779  Acimport,
1780  dummyExport, labelList);
1781  }
1782 
1783 
1784  /*********************************************************************************************************/
1785  // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1786  // Computes P.T * A * P -> Ac using a 2-pass algorithm.
1787  // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1788  // Currently, this implementation never gets called.
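 // Sketch of the two passes below:
 //   Pass 1 (symbolic): walk R, A, and P once, recording only which columns
 //   appear in each row of Ac; this fills Acrowptr/Accolind, after which
 //   Acvals can be allocated at exactly nnz entries.
 //   Pass 2 (numeric): walk P^T, A, and P again and accumulate each product
 //   Pki*Akl*Plj into the already-known slot of Acvals.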
1789  template<class Scalar,
1790  class LocalOrdinal,
1791  class GlobalOrdinal,
1792  class Node>
1793  void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1794  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1795  const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1796  const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1797  const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1798  const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1799  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1800  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1801  const std::string& label,
1802  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1803 #ifdef HAVE_TPETRA_MMM_TIMINGS
1804  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1805  using Teuchos::TimeMonitor;
1806  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1807 #endif
1808 
1809  using Teuchos::Array;
1810  using Teuchos::ArrayRCP;
1811  using Teuchos::ArrayView;
1812  using Teuchos::RCP;
1813  using Teuchos::rcp;
1814 
1815  typedef Scalar SC;
1816  typedef LocalOrdinal LO;
1817  typedef GlobalOrdinal GO;
1818  typedef Node NO;
1819  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1820  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1821  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1822 
1823  // number of rows on the process of the fine matrix
1824  size_t m = Pview.origMatrix->getNodeNumRows();
1825  // number of rows on the process of the coarse matrix
1826  size_t n = Ac.getRowMap()->getNodeNumElements();
1827  LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1828 
1829  // Get Data Pointers
1830  ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
1831  ArrayRCP<size_t> Acrowptr_RCP;
1832  ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
1833  ArrayRCP<LO> Accolind_RCP;
1834  ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
1835  ArrayRCP<SC> Acvals_RCP;
1836 
1837  // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
1838  // out of the CrsMatrix. This code computes R * A * (P_local +
1839  // P_remote), where P_local contains the locally owned rows of P,
1840  // and P_remote the (previously Import'ed) remote rows of P.
1841 
1842  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1843  Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
1844 
1845  if (!Pview.importMatrix.is_null())
1846  Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1847 
1848  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1849  // where Teuchos::ArrayRCP::operator[] may be slower than
1850  // Teuchos::ArrayView::operator[].
1851 
1852  // For efficiency
1853  ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
1854  ArrayView<const LO> Acolind, Pcolind, Icolind;
1855  ArrayView<const SC> Avals, Pvals, Ivals;
1856  ArrayView<size_t> Acrowptr;
1857  ArrayView<LO> Accolind;
1858  ArrayView<SC> Acvals;
1859  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1860  Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
1861  if (!Pview.importMatrix.is_null()) {
1862  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1863  }
1864 
1866  // In a first pass, determine the graph of Ac.
1868 
1870  // Get the graph of Ac. This gets the local transpose of P,
1871  // then loops over R, A, P to get the graph of Ac.
1873 
1874 #ifdef HAVE_TPETRA_MMM_TIMINGS
1875  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1876 #endif
1877 
1879  // Get the local transpose of the graph of P by locally transposing
1880  // all of P
1881 
1882  ArrayRCP<const size_t> Rrowptr_RCP;
1883  ArrayRCP<const LO> Rcolind_RCP;
1884  ArrayRCP<const Scalar> Rvals_RCP;
1885  ArrayView<const size_t> Rrowptr;
1886  ArrayView<const LO> Rcolind;
1887  ArrayView<const SC> Rvals;
1888 
1889  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1890  RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(new Teuchos::ParameterList);
1891  if (!params.is_null())
1892  transposeParams->set("compute global constants",
1893  params->get("compute global constants: temporaries",
1894  false));
1895  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
1896 
1897  Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
1898  Rrowptr = Rrowptr_RCP();
1899  Rcolind = Rcolind_RCP();
1900  Rvals = Rvals_RCP();
1901 
1903  // Construct graph
1904 
1905  #ifdef HAVE_TPETRA_MMM_TIMINGS
1906  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
1907  #endif
1908 
1909  const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1910  Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1911 
1912  size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1913  size_t nnzPerRowA = 100;
1914  if (Aview.origMatrix->getNodeNumEntries() > 0)
1915  nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();
1916  Acrowptr_RCP.resize(n+1);
1917  Acrowptr = Acrowptr_RCP();
1918  Accolind_RCP.resize(nnz_alloc);
1919  Accolind = Accolind_RCP();
1920 
1921  size_t nnz = 0, nnz_old = 0;
1922  for (size_t i = 0; i < n; i++) {
1923  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1924  // on the calling process.
1925  Acrowptr[i] = nnz;
1926 
1927  // mfh 27 Sep 2016: For each entry of R in the current row of R
1928  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1929  LO k = Rcolind[kk]; // local column index of current entry of R
1930  // For each entry of A in the current row of A
1931  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1932  LO l = Acolind[ll]; // local column index of current entry of A
1933 
1934  if (Acol2PRow[l] != LO_INVALID) {
1935  // mfh 27 Sep 2016: If the entry of Acol2PRow
1936  // corresponding to the current entry of A is populated, then
1937  // the corresponding row of P is in P_local (i.e., it lives on
1938  // the calling process).
1939 
1940  // Local matrix
1941  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1942 
1943  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1944  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1945  LO j = Pcolind[jj];
1946  LO Acj = Pcol2Accol[j];
1947 
1948  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1949  // New entry
1950  ac_status[Acj] = nnz;
1951  Accolind[nnz] = Acj;
1952  nnz++;
1953  }
1954  }
1955  } else {
1956  // mfh 27 Sep 2016: If the entry of Acol2PRow
1957  // corresponding to the current entry of A is NOT populated (has
1958  // a flag "invalid" value), then the corresponding row of P is
1959  // in P_remote (i.e., it does not live on the calling process).
1960 
1961  // Remote matrix
1962  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1963  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1964  LO j = Icolind[jj];
1965  LO Acj = PIcol2Accol[j];
1966 
1967  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1968  // New entry
1969  ac_status[Acj] = nnz;
1970  Accolind[nnz] = Acj;
1971  nnz++;
1972  }
1973  }
1974  }
1975  }
1976  }
1977  // Resize for next pass if needed
1978  // cag: Maybe we can do something more subtle here, and not double
1979  // the size right away.
1980  if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1981  nnz_alloc *= 2;
1982  nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1983  Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1984  // (Acvals is not needed during this symbolic pass; it is allocated at exactly nnz entries below.)
1985  }
1986  nnz_old = nnz;
1987  }
1988  Acrowptr[n] = nnz;
1989 
1990  // Downward resize
1991  Accolind_RCP.resize(nnz);
1992  Accolind = Accolind_RCP();
1993 
1994  // Allocate Acvals
1995  Acvals_RCP.resize(nnz, SC_ZERO);
1996  Acvals = Acvals_RCP();
1997 
1998 
2000  // In a second pass, enter the values into Acvals.
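 // Note: each destination slot is found by a linear scan over row i of Accolind
 // (the pp loops below). Repeating that search for every contribution is the
 // likely reason this variant measured slower on SerialNode than the one-pass
 // version with its ac_status accumulator.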
2002 
2003  #ifdef HAVE_TPETRA_MMM_TIMINGS
2004  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
2005  #endif
2006 
2007 
2008  for (size_t k = 0; k < m; k++) {
2009  for (size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
2010  LO i = Pcolind[ii];
2011  const SC Pki = Pvals[ii];
2012  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
2013  LO l = Acolind[ll];
2014  const SC Akl = Avals[ll];
2015  if (Akl == SC_ZERO)
2016  continue;
2017  if (Acol2PRow[l] != LO_INVALID) {
2018  // mfh 27 Sep 2016: If the entry of Acol2PRow
2019  // corresponding to the current entry of A is populated, then
2020  // the corresponding row of P is in P_local (i.e., it lives on
2021  // the calling process).
2022 
2023  // Local matrix
2024  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
2025  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
2026  LO j = Pcolind[jj];
2027  LO Acj = Pcol2Accol[j];
2028  size_t pp;
2029  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
2030  if (Accolind[pp] == Acj)
2031  break;
2032  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
2033  // std::runtime_error, "problem with Ac column indices");
2034  Acvals[pp] += Pki*Akl*Pvals[jj];
2035  }
2036  } else {
2037  // mfh 27 Sep 2016: If the entry of Acol2PRow
2038  // corresponding to the current entry of A is NOT populated (has
2039  // a flag "invalid" value), then the corresponding row of P is
2040  // in P_remote (i.e., it does not live on the calling process).
2041 
2042  // Remote matrix
2043  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
2044  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
2045  LO j = Icolind[jj];
2046  LO Acj = PIcol2Accol[j];
2047  size_t pp;
2048  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
2049  if (Accolind[pp] == Acj)
2050  break;
2051  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
2052  // std::runtime_error, "problem with Ac column indices");
2053  Acvals[pp] += Pki*Akl*Ivals[jj];
2054  }
2055  }
2056  }
2057  }
2058  }
2059 
2060 
2061 #ifdef HAVE_TPETRA_MMM_TIMINGS
2062  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
2063 #endif
2064 
2065  // Final sort & set of CRS arrays
2066  //
2067  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2068  // matrix-matrix multiply routine sort the entries for us?
2069  Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
2070 
2071  // mfh 27 Sep 2016: This just sets pointers.
2072  Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
2073 
2074 #ifdef HAVE_TPETRA_MMM_TIMINGS
2075  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
2076 #endif
2077 
2078  // Final FillComplete
2079  //
2080  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2081  // Import (from domain Map to column Map) construction (which costs
2082  // lots of communication) by taking the previously constructed
2083  // Import object. We should be able to do this without interfering
2084  // with the implementation of the local part of sparse matrix-matrix
2085  // multiply above.
2086  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2087  labelList->set("Timer Label",label);
2088  // labelList->set("Sort column Map ghost GIDs")
2089  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2090  RCP<const Export<LO,GO,NO> > dummyExport;
2091  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
2092  Pview.origMatrix->getDomainMap(),
2093  Acimport,
2094  dummyExport, labelList);
2095  }
2096 
2097 
2098 
2099  } // End namespace MMdetails
2100 
2101 } //End namespace Tpetra
2102 //
2103 // Explicit instantiation macro
2104 //
2105 // Must be expanded from within the Tpetra namespace!
2106 //
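// A typical expansion, from inside namespace Tpetra (the template arguments here
// are placeholders; real instantiations are generated by the build system from
// the enabled Scalar/LocalOrdinal/GlobalOrdinal/Node combinations):
//
//   TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(double, int, long long, my_node_type)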
2107 
2108 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
2109  \
2110  template \
2111  void TripleMatrixMultiply::MultiplyRAP( \
2112  const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
2113  bool transposeR, \
2114  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2115  bool transposeA, \
2116  const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
2117  bool transposeP, \
2118  CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
2119  bool call_FillComplete_on_result, \
2120  const std::string & label, \
2121  const Teuchos::RCP<Teuchos::ParameterList>& params); \
2122  \
2123 
2124 
2125 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP