Tpetra parallel linear algebra  Version of the Day
TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
44 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
45 
46 namespace Tpetra {
47 
48 namespace MatrixMatrix{
49 
50 namespace ExtraKernels{
51 
52 
53 template<class CrsMatrixType>
54 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
55  // Follows the NZ estimate in ML's ml_matmatmult.c
56  size_t Aest = 100, Best=100;
57  if (A.getNodeNumEntries() > 0)
58  Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
59  if (B.getNodeNumEntries() > 0)
60  Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
61 
62  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
63  nnzperrow *= nnzperrow;
64 
65  return nnzperrow;
66 }
67 
68 
69 #if defined (HAVE_TPETRA_INST_OPENMP)
70 /*********************************************************************************************************/
71 template<class Scalar,
72  class LocalOrdinal,
73  class GlobalOrdinal,
74  class LocalOrdinalViewType>
75 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
76  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
77  const LocalOrdinalViewType & targetMapToOrigRow,
78  const LocalOrdinalViewType & targetMapToImportRow,
79  const LocalOrdinalViewType & Bcol2Ccol,
80  const LocalOrdinalViewType & Icol2Ccol,
81  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
82  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
83  const std::string& label,
84  const Teuchos::RCP<Teuchos::ParameterList>& params) {
85 #ifdef HAVE_TPETRA_MMM_TIMINGS
86  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
87  using Teuchos::TimeMonitor;
88  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
89 #endif
90 
91  using Teuchos::Array;
92  using Teuchos::ArrayRCP;
93  using Teuchos::ArrayView;
94  using Teuchos::RCP;
95  using Teuchos::rcp;
96 
97 
98  // Lots and lots of typedefs
100  // typedef typename KCRS::device_type device_t;
101  typedef typename KCRS::StaticCrsGraphType graph_t;
102  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
103  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
104  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
105  typedef typename KCRS::values_type::non_const_type scalar_view_t;
106 
107  // Unmanaged versions of the above
108  typedef UnmanagedView<lno_view_t> u_lno_view_t;
109  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
110  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
111 
112  typedef Scalar SC;
113  typedef LocalOrdinal LO;
114  typedef GlobalOrdinal GO;
115  typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
116  typedef Map<LO,GO,NO> map_type;
117 
118  // NOTE (mfh 15 Sep 2017) This is specifically only for
119  // execution_space = Kokkos::OpenMP, so we neither need nor want
120  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
121  typedef NO::execution_space execution_space;
122  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
123 
124  // All of the invalid guys
125  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
126  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
127  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
128 
129  // Grab the Kokkos::SparseCrsMatrices & inner stuff
130  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
131  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
132 
133  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
134  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
135  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
136  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
137 
138  c_lno_view_t Irowptr;
139  lno_nnz_view_t Icolind;
140  scalar_view_t Ivals;
141  if(!Bview.importMatrix.is_null()) {
142  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
143  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
144  Ivals = Bview.importMatrix->getLocalMatrix().values;
145  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
146  }
147 
148  // Sizes
149  RCP<const map_type> Ccolmap = C.getColMap();
150  size_t m = Aview.origMatrix->getNodeNumRows();
151  size_t n = Ccolmap->getNodeNumElements();
152  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
153 
154  // Get my node / thread info (right from openmp or parameter list)
155  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
156  if(!params.is_null()) {
157  if(params->isParameter("openmp: ltg thread max"))
158  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
159  }
160 
161  // Thread-local memory
162  Kokkos::View<u_lno_view_t*> tl_rowptr("top_rowptr",thread_max);
163  Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind",thread_max);
164  Kokkos::View<u_scalar_view_t*> tl_values("top_values",thread_max);
165 
166  double thread_chunk = (double)(m) / thread_max;
167 
168  // Run chunks of the matrix independently
169  Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
170  {
171  // Thread coordination stuff
172  size_t my_thread_start = tid * thread_chunk;
173  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
174  size_t my_thread_m = my_thread_stop - my_thread_start;
175 
176  // Size estimate
177  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
178 
179  // Allocations
180  std::vector<size_t> c_status(n,INVALID);
181 
182  u_lno_view_t Crowptr((typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m+1)),my_thread_m+1);
183  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
184  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
185 
186  // For each row of A/C
187  size_t CSR_ip = 0, OLD_ip = 0;
188  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
189  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
190  // on the calling process.
191  Crowptr(i-my_thread_start) = CSR_ip;
192 
193  // mfh 27 Sep 2016: For each entry of A in the current row of A
194  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
195  LO Aik = Acolind(k); // local column index of current entry of A
196  const SC Aval = Avals(k); // value of current entry of A
197  if (Aval == SC_ZERO)
198  continue; // skip explicitly stored zero values in A
199 
200  if (targetMapToOrigRow(Aik) != LO_INVALID) {
201  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
202  // corresponding to the current entry of A is populated, then
203  // the corresponding row of B is in B_local (i.e., it lives on
204  // the calling process).
205 
206  // Local matrix
207  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
208 
209  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
210  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
211  LO Bkj = Bcolind(j);
212  LO Cij = Bcol2Ccol(Bkj);
213 
214  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
215  // New entry
216  c_status[Cij] = CSR_ip;
217  Ccolind(CSR_ip) = Cij;
218  Cvals(CSR_ip) = Aval*Bvals(j);
219  CSR_ip++;
220 
221  } else {
222  Cvals(c_status[Cij]) += Aval*Bvals(j);
223  }
224  }
225 
226  } else {
227  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
228  // corresponding to the current entry of A NOT populated (has
229  // a flag "invalid" value), then the corresponding row of B is
230  // in B_local (i.e., it lives on the calling process).
231 
232  // Remote matrix
233  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
234  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
235  LO Ikj = Icolind(j);
236  LO Cij = Icol2Ccol(Ikj);
237 
238  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
239  // New entry
240  c_status[Cij] = CSR_ip;
241  Ccolind(CSR_ip) = Cij;
242  Cvals(CSR_ip) = Aval*Ivals(j);
243  CSR_ip++;
244 
245  } else {
246  Cvals(c_status[Cij]) += Aval*Ivals(j);
247  }
248  }
249  }
250  }
251 
252  // Resize for next pass if needed
253  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
254  CSR_alloc *= 2;
255  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
256  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
257  }
258  OLD_ip = CSR_ip;
259  }
260 
261  tl_rowptr(tid) = Crowptr;
262  tl_colind(tid) = Ccolind;
263  tl_values(tid) = Cvals;
264  Crowptr(my_thread_m) = CSR_ip;
265  });
266 
267  // Do the copy out
268  lno_view_t row_mapC("non_const_lnow_row", m + 1);
269  lno_nnz_view_t entriesC;
270  scalar_view_t valuesC;
271  copy_out_from_thread_memory(tl_rowptr,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
272 
273  //Free the unamanged views
274  for(size_t i=0; i<thread_max; i++) {
275  if(tl_rowptr(i).data()) free(tl_rowptr(i).data());
276  if(tl_colind(i).data()) free(tl_colind(i).data());
277  if(tl_values(i).data()) free(tl_values(i).data());
278  }
279 
280 #ifdef HAVE_TPETRA_MMM_TIMINGS
281  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
282 #endif
283  // Sort & set values
284  if (params.is_null() || params->get("sort entries",true))
285  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
286  C.setAllValues(row_mapC,entriesC,valuesC);
287 
288 }
289 
290 /*********************************************************************************************************/
291 template<class Scalar,
292  class LocalOrdinal,
293  class GlobalOrdinal,
294  class LocalOrdinalViewType>
295 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
296  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
297  const LocalOrdinalViewType & targetMapToOrigRow,
298  const LocalOrdinalViewType & targetMapToImportRow,
299  const LocalOrdinalViewType & Bcol2Ccol,
300  const LocalOrdinalViewType & Icol2Ccol,
301  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
302  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
303  const std::string& label,
304  const Teuchos::RCP<Teuchos::ParameterList>& params) {
305 #ifdef HAVE_TPETRA_MMM_TIMINGS
306  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
307  using Teuchos::TimeMonitor;
308  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
309 #endif
310 
311  using Teuchos::Array;
312  using Teuchos::ArrayRCP;
313  using Teuchos::ArrayView;
314  using Teuchos::RCP;
315  using Teuchos::rcp;
316 
317  // Lots and lots of typedefs
319  // typedef typename KCRS::device_type device_t;
320  typedef typename KCRS::StaticCrsGraphType graph_t;
321  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
322  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
323  typedef typename KCRS::values_type::non_const_type scalar_view_t;
324 
325  typedef Scalar SC;
326  typedef LocalOrdinal LO;
327  typedef GlobalOrdinal GO;
328  typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
329  typedef Map<LO,GO,NO> map_type;
330 
331  // NOTE (mfh 15 Sep 2017) This is specifically only for
332  // execution_space = Kokkos::OpenMP, so we neither need nor want
333  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
334  typedef NO::execution_space execution_space;
335  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
336 
337  // All of the invalid guys
338  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
339  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
340  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
341 
342  // Grab the Kokkos::SparseCrsMatrices & inner stuff
343  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
344  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
345  const KCRS & Cmat = C.getLocalMatrix();
346 
347  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
348  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
349  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
350  scalar_view_t Cvals = Cmat.values;
351 
352  c_lno_view_t Irowptr;
353  c_lno_nnz_view_t Icolind;
354  scalar_view_t Ivals;
355  if(!Bview.importMatrix.is_null()) {
356  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
357  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
358  Ivals = Bview.importMatrix->getLocalMatrix().values;
359  }
360 
361  // Sizes
362  RCP<const map_type> Ccolmap = C.getColMap();
363  size_t m = Aview.origMatrix->getNodeNumRows();
364  size_t n = Ccolmap->getNodeNumElements();
365 
366  // Get my node / thread info (right from openmp or parameter list)
367  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
368  if(!params.is_null()) {
369  if(params->isParameter("openmp: ltg thread max"))
370  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
371  }
372 
373  double thread_chunk = (double)(m) / thread_max;
374 
375  // Run chunks of the matrix independently
376  Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
377  {
378  // Thread coordination stuff
379  size_t my_thread_start = tid * thread_chunk;
380  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
381 
382  // Allocations
383  std::vector<size_t> c_status(n,INVALID);
384 
385  // For each row of A/C
386  size_t CSR_ip = 0, OLD_ip = 0;
387  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
388  // First fill the c_status array w/ locations where we're allowed to
389  // generate nonzeros for this row
390  OLD_ip = Crowptr(i);
391  CSR_ip = Crowptr(i+1);
392  for (size_t k = OLD_ip; k < CSR_ip; k++) {
393  c_status[Ccolind(k)] = k;
394  // Reset values in the row of C
395  Cvals(k) = SC_ZERO;
396  }
397 
398  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
399  LO Aik = Acolind(k);
400  const SC Aval = Avals(k);
401  if (Aval == SC_ZERO)
402  continue;
403 
404  if (targetMapToOrigRow(Aik) != LO_INVALID) {
405  // Local matrix
406  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
407 
408  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
409  LO Bkj = Bcolind(j);
410  LO Cij = Bcol2Ccol(Bkj);
411 
412  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
413  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
414  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
415 
416  Cvals(c_status[Cij]) += Aval * Bvals(j);
417  }
418  } else {
419  // Remote matrix
420  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
421  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
422  LO Ikj = Icolind(j);
423  LO Cij = Icol2Ccol(Ikj);
424 
425  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
426  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
427  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
428 
429  Cvals(c_status[Cij]) += Aval * Ivals(j);
430  }
431  }
432  }
433  }
434  });
435 
436  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
437 }
438 
439 /*********************************************************************************************************/
440 template<class Scalar,
441  class LocalOrdinal,
442  class GlobalOrdinal,
443  class LocalOrdinalViewType>
444 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
445  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
446  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
447  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
448  const LocalOrdinalViewType & targetMapToOrigRow,
449  const LocalOrdinalViewType & targetMapToImportRow,
450  const LocalOrdinalViewType & Bcol2Ccol,
451  const LocalOrdinalViewType & Icol2Ccol,
452  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
453  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
454  const std::string& label,
455  const Teuchos::RCP<Teuchos::ParameterList>& params) {
456 #ifdef HAVE_TPETRA_MMM_TIMINGS
457  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
458  using Teuchos::TimeMonitor;
459  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
460 #endif
461 
462  using Teuchos::Array;
463  using Teuchos::ArrayRCP;
464  using Teuchos::ArrayView;
465  using Teuchos::RCP;
466  using Teuchos::rcp;
467 
468  // Lots and lots of typedefs
469  typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
471  // typedef typename KCRS::device_type device_t;
472  typedef typename KCRS::StaticCrsGraphType graph_t;
473  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
474  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
475  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
476  typedef typename KCRS::values_type::non_const_type scalar_view_t;
477 
478  // Unmanaged versions of the above
479  typedef UnmanagedView<lno_view_t> u_lno_view_t;
480  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
481  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
482 
483  // Jacobi-specific
484  typedef typename scalar_view_t::memory_space scalar_memory_space;
485 
486  typedef Scalar SC;
487  typedef LocalOrdinal LO;
488  typedef GlobalOrdinal GO;
489  typedef Node NO;
490  typedef Map<LO,GO,NO> map_type;
491 
492  // NOTE (mfh 15 Sep 2017) This is specifically only for
493  // execution_space = Kokkos::OpenMP, so we neither need nor want
494  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
495  typedef NO::execution_space execution_space;
496  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
497 
498  // All of the invalid guys
499  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
500  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
501  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
502 
503  // Grab the Kokkos::SparseCrsMatrices & inner stuff
504  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
505  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
506 
507  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
508  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
509  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
510  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
511 
512  c_lno_view_t Irowptr;
513  lno_nnz_view_t Icolind;
514  scalar_view_t Ivals;
515  if(!Bview.importMatrix.is_null()) {
516  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
517  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
518  Ivals = Bview.importMatrix->getLocalMatrix().values;
519  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
520  }
521 
522  // Jacobi-specific inner stuff
523  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
524 
525  // Sizes
526  RCP<const map_type> Ccolmap = C.getColMap();
527  size_t m = Aview.origMatrix->getNodeNumRows();
528  size_t n = Ccolmap->getNodeNumElements();
529  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
530 
531  // Get my node / thread info (right from openmp)
532  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
533  if(!params.is_null()) {
534  if(params->isParameter("openmp: ltg thread max"))
535  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
536  }
537 
538  // Thread-local memory
539  Kokkos::View<u_lno_view_t*> tl_rowptr("top_rowptr",thread_max);
540  Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind",thread_max);
541  Kokkos::View<u_scalar_view_t*> tl_values("top_values",thread_max);
542 
543  double thread_chunk = (double)(m) / thread_max;
544 
545  // Run chunks of the matrix independently
546  Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
547  {
548  // Thread coordination stuff
549  size_t my_thread_start = tid * thread_chunk;
550  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
551  size_t my_thread_m = my_thread_stop - my_thread_start;
552 
553  // Size estimate
554  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
555 
556  // Allocations
557  std::vector<size_t> c_status(n,INVALID);
558 
559  u_lno_view_t Crowptr((typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m+1)),my_thread_m+1);
560  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
561  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
562 
563  // For each row of A/C
564  size_t CSR_ip = 0, OLD_ip = 0;
565  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
566  // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
567  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
568  // on the calling process.
569  Crowptr(i-my_thread_start) = CSR_ip;
570  // NOTE: Vector::getLocalView returns a rank 2 view here
571  SC minusOmegaDval = -omega*Dvals(i,0);
572 
573  // Entries of B
574  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
575  const SC Bval = Bvals(j);
576  if (Bval == SC_ZERO)
577  continue;
578  LO Bij = Bcolind(j);
579  LO Cij = Bcol2Ccol(Bij);
580 
581  // Assume no repeated entries in B
582  c_status[Cij] = CSR_ip;
583  Ccolind(CSR_ip) = Cij;
584  Cvals(CSR_ip) = Bvals[j];
585  CSR_ip++;
586  }
587 
588  // Entries of -omega * Dinv * A * B
589  // mfh 27 Sep 2016: For each entry of A in the current row of A
590  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
591  LO Aik = Acolind(k); // local column index of current entry of A
592  const SC Aval = Avals(k); // value of current entry of A
593  if (Aval == SC_ZERO)
594  continue; // skip explicitly stored zero values in A
595 
596  if (targetMapToOrigRow(Aik) != LO_INVALID) {
597  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
598  // corresponding to the current entry of A is populated, then
599  // the corresponding row of B is in B_local (i.e., it lives on
600  // the calling process).
601 
602  // Local matrix
603  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
604 
605  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
606  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
607  LO Bkj = Bcolind(j);
608  LO Cij = Bcol2Ccol(Bkj);
609 
610  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
611  // New entry
612  c_status[Cij] = CSR_ip;
613  Ccolind(CSR_ip) = Cij;
614  Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
615  CSR_ip++;
616 
617  } else {
618  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
619  }
620  }
621 
622  } else {
623  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
624  // corresponding to the current entry of A NOT populated (has
625  // a flag "invalid" value), then the corresponding row of B is
626  // in B_local (i.e., it lives on the calling process).
627 
628  // Remote matrix
629  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
630  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
631  LO Ikj = Icolind(j);
632  LO Cij = Icol2Ccol(Ikj);
633 
634  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
635  // New entry
636  c_status[Cij] = CSR_ip;
637  Ccolind(CSR_ip) = Cij;
638  Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
639  CSR_ip++;
640 
641  } else {
642  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
643  }
644  }
645  }
646  }
647 
648  // Resize for next pass if needed
649  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
650  CSR_alloc *= 2;
651  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
652  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
653  }
654  OLD_ip = CSR_ip;
655  }
656 
657  tl_rowptr(tid) = Crowptr;
658  tl_colind(tid) = Ccolind;
659  tl_values(tid) = Cvals;
660  Crowptr(my_thread_m) = CSR_ip;
661  });
662 
663 
664 
665  // Do the copy out
666  lno_view_t row_mapC("non_const_lnow_row", m + 1);
667  lno_nnz_view_t entriesC;
668  scalar_view_t valuesC;
669  copy_out_from_thread_memory(tl_rowptr,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
670 
671  //Free the unamanged views
672  for(size_t i=0; i<thread_max; i++) {
673  if(tl_rowptr(i).data()) free(tl_rowptr(i).data());
674  if(tl_colind(i).data()) free(tl_colind(i).data());
675  if(tl_values(i).data()) free(tl_values(i).data());
676  }
677 
678 #ifdef HAVE_TPETRA_MMM_TIMINGS
679  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
680 #endif
681  // Sort & set values
682  if (params.is_null() || params->get("sort entries",true))
683  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
684  C.setAllValues(row_mapC,entriesC,valuesC);
685 
686 }
687 
688 
689 
690 /*********************************************************************************************************/
691 template<class Scalar,
692  class LocalOrdinal,
693  class GlobalOrdinal,
694  class LocalOrdinalViewType>
695 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
696  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
697  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
698  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
699  const LocalOrdinalViewType & targetMapToOrigRow,
700  const LocalOrdinalViewType & targetMapToImportRow,
701  const LocalOrdinalViewType & Bcol2Ccol,
702  const LocalOrdinalViewType & Icol2Ccol,
703  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
704  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
705  const std::string& label,
706  const Teuchos::RCP<Teuchos::ParameterList>& params) {
707 #ifdef HAVE_TPETRA_MMM_TIMINGS
708  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
709  using Teuchos::TimeMonitor;
710  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
711 #endif
712  using Teuchos::Array;
713  using Teuchos::ArrayRCP;
714  using Teuchos::ArrayView;
715  using Teuchos::RCP;
716  using Teuchos::rcp;
717 
718  // Lots and lots of typedefs
719  typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
721  // typedef typename KCRS::device_type device_t;
722  typedef typename KCRS::StaticCrsGraphType graph_t;
723  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
724  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
725  typedef typename KCRS::values_type::non_const_type scalar_view_t;
726 
727  // Jacobi-specific
728  typedef typename scalar_view_t::memory_space scalar_memory_space;
729 
730  typedef Scalar SC;
731  typedef LocalOrdinal LO;
732  typedef GlobalOrdinal GO;
733  typedef Node NO;
734  typedef Map<LO,GO,NO> map_type;
735 
736  // NOTE (mfh 15 Sep 2017) This is specifically only for
737  // execution_space = Kokkos::OpenMP, so we neither need nor want
738  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
739  typedef NO::execution_space execution_space;
740  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
741 
742  // All of the invalid guys
743  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
744  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
745  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
746 
747  // Grab the Kokkos::SparseCrsMatrices & inner stuff
748  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
749  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
750  const KCRS & Cmat = C.getLocalMatrix();
751 
752  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
753  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
754  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
755  scalar_view_t Cvals = Cmat.values;
756 
757  c_lno_view_t Irowptr;
758  c_lno_nnz_view_t Icolind;
759  scalar_view_t Ivals;
760  if(!Bview.importMatrix.is_null()) {
761  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
762  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
763  Ivals = Bview.importMatrix->getLocalMatrix().values;
764  }
765 
766  // Jacobi-specific inner stuff
767  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
768 
769  // Sizes
770  RCP<const map_type> Ccolmap = C.getColMap();
771  size_t m = Aview.origMatrix->getNodeNumRows();
772  size_t n = Ccolmap->getNodeNumElements();
773 
774  // Get my node / thread info (right from openmp or parameter list)
775  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
776  if(!params.is_null()) {
777  if(params->isParameter("openmp: ltg thread max"))
778  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
779  }
780 
781  double thread_chunk = (double)(m) / thread_max;
782 
783  // Run chunks of the matrix independently
784  Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
785  {
786  // Thread coordination stuff
787  size_t my_thread_start = tid * thread_chunk;
788  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
789 
790  // Allocations
791  std::vector<size_t> c_status(n,INVALID);
792 
793  // For each row of A/C
794  size_t CSR_ip = 0, OLD_ip = 0;
795  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
796  // First fill the c_status array w/ locations where we're allowed to
797  // generate nonzeros for this row
798  OLD_ip = Crowptr(i);
799  CSR_ip = Crowptr(i+1);
800  // NOTE: Vector::getLocalView returns a rank 2 view here
801  SC minusOmegaDval = -omega*Dvals(i,0);
802 
803  for (size_t k = OLD_ip; k < CSR_ip; k++) {
804  c_status[Ccolind(k)] = k;
805  // Reset values in the row of C
806  Cvals(k) = SC_ZERO;
807  }
808 
809  // Entries of B
810  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
811  const SC Bval = Bvals(j);
812  if (Bval == SC_ZERO)
813  continue;
814  LO Bij = Bcolind(j);
815  LO Cij = Bcol2Ccol(Bij);
816 
817  // Assume no repeated entries in B
818  Cvals(c_status[Cij]) += Bvals(j);
819  CSR_ip++;
820  }
821 
822 
823  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
824  LO Aik = Acolind(k);
825  const SC Aval = Avals(k);
826  if (Aval == SC_ZERO)
827  continue;
828 
829  if (targetMapToOrigRow(Aik) != LO_INVALID) {
830  // Local matrix
831  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
832 
833  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
834  LO Bkj = Bcolind(j);
835  LO Cij = Bcol2Ccol(Bkj);
836 
837  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
838  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
839  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
840 
841  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
842  }
843  } else {
844  // Remote matrix
845  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
846  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
847  LO Ikj = Icolind(j);
848  LO Cij = Icol2Ccol(Ikj);
849 
850  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
851  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
852  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
853 
854  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
855  }
856  }
857  }
858  }
859  });
860 
861  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
862 }
863 
864 
865 /*********************************************************************************************************/
866 template<class InRowptrArrayType, class InColindArrayType, class InValsArrayType,
867  class OutRowptrType, class OutColindType, class OutValsType>
868 void copy_out_from_thread_memory(const InRowptrArrayType & Inrowptr, const InColindArrayType &Incolind, const InValsArrayType & Invalues,
869  size_t m, double thread_chunk,
870  OutRowptrType & row_mapC, OutColindType &entriesC, OutValsType & valuesC ) {
871  typedef OutRowptrType lno_view_t;
872  typedef OutColindType lno_nnz_view_t;
873  typedef OutValsType scalar_view_t;
874  typedef typename lno_view_t::execution_space execution_space;
875  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
876 
877  // Generate the starting nnz number per thread
878  size_t thread_max = Inrowptr.size();
879  size_t c_nnz_size=0;
880  lno_view_t thread_start_nnz("thread_nnz",thread_max+1);
881  Kokkos::parallel_scan("LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (const size_t i, size_t& update, const bool final) {
882  size_t mynnz = Inrowptr(i)(Inrowptr(i).extent(0)-1);
883  if(final) thread_start_nnz(i) = update;
884  update+=mynnz;
885  if(final && i+1==thread_max) thread_start_nnz(i+1)=update;
886  });
887  c_nnz_size = thread_start_nnz(thread_max);
888 
889  // Allocate output
890  lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size); entriesC = entriesC_;
891  scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size); valuesC = valuesC_;
892 
893  // Copy out
894  Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid) {
895  size_t my_thread_start = tid * thread_chunk;
896  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
897  size_t nnz_thread_start = thread_start_nnz(tid);
898 
899  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
900  size_t ii = i - my_thread_start;
901  // Rowptr
902  row_mapC(i) = nnz_thread_start + Inrowptr(tid)(ii);
903  if (i==m-1) {
904  row_mapC(m) = nnz_thread_start + Inrowptr(tid)(ii+1);
905  }
906 
907  // Colind / Values
908  for(size_t j = Inrowptr(tid)(ii); j<Inrowptr(tid)(ii+1); j++) {
909  entriesC(nnz_thread_start + j) = Incolind(tid)(j);
910  valuesC(nnz_thread_start + j) = Invalues(tid)(j);
911  }
912  }
913  });
914 }//end copy_out
915 
916 #endif // OpenMP
917 
918 
919 /*********************************************************************************************************/
920 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
921 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
922  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
923  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
924  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
925  const LocalOrdinalViewType & Acol2Brow,
926  const LocalOrdinalViewType & Acol2Irow,
927  const LocalOrdinalViewType & Bcol2Ccol,
928  const LocalOrdinalViewType & Icol2Ccol,
929  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
930  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
931  const std::string& label,
932  const Teuchos::RCP<Teuchos::ParameterList>& params) {
933 #ifdef HAVE_TPETRA_MMM_TIMINGS
934  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
935  using Teuchos::TimeMonitor;
936  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
937  Teuchos::RCP<Teuchos::TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
938 #endif
939  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
940 
941  // This kernel computes (I-omega Dinv A) B the slow way (for generality's sake, you must understand)
942 
943  // 1) Multiply A*B
944  Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(),0));
945  Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(" MSAK"),params);
946 
947 #ifdef HAVE_TPETRA_MMM_TIMINGS
948 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
949 #endif
950 
951  // 2) Scale A by Dinv
952  AB->leftScale(Dinv);
953 
954 #ifdef HAVE_TPETRA_MMM_TIMINGS
955 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
956 #endif
957 
958  // 3) Add [-omega Dinv A] + B
959  Teuchos::ParameterList jparams;
960  if(!params.is_null()) {
961  jparams = *params;
962  jparams.set("label",label+std::string(" MSAK Add"));
963  }
964  Scalar one = Teuchos::ScalarTraits<Scalar>::one();
965  Tpetra::MatrixMatrix::add(one,false,*Bview.origMatrix,Scalar(-omega),false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,false));
966 
967  }// jacobi_A_B_newmatrix_MultiplyScaleAddKernel
968 
969 
970 
971 }//ExtraKernels
972 }//MatrixMatrix
973 }//Tpetra
974 
975 
976 #endif
Tpetra::MatrixMatrix::add
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Definition: TpetraExt_MatrixMatrix_def.hpp:572
Tpetra::Classes::CrsMatrix::local_matrix_type
KokkosSparse::CrsMatrix< impl_scalar_type, LocalOrdinal, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Definition: Tpetra_CrsMatrix_decl.hpp:483
Tpetra
Namespace Tpetra contains the class and methods constituting the Tpetra library.