42 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
44 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
48 namespace MatrixMatrix{
50 namespace ExtraKernels{
53 template<
class CrsMatrixType>
54 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
56 size_t Aest = 100, Best=100;
57 if (A.getNodeNumEntries() > 0)
58 Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
59 if (B.getNodeNumEntries() > 0)
60 Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
62 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
63 nnzperrow *= nnzperrow;
69 #if defined (HAVE_TPETRA_INST_OPENMP)
71 template<
class Scalar,
74 class LocalOrdinalViewType>
75 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
76 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
77 const LocalOrdinalViewType & targetMapToOrigRow,
78 const LocalOrdinalViewType & targetMapToImportRow,
79 const LocalOrdinalViewType & Bcol2Ccol,
80 const LocalOrdinalViewType & Icol2Ccol,
81 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
82 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
83 const std::string& label,
84 const Teuchos::RCP<Teuchos::ParameterList>& params) {
85 #ifdef HAVE_TPETRA_MMM_TIMINGS
86 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
87 using Teuchos::TimeMonitor;
88 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix LTGCore"))));
92 using Teuchos::ArrayRCP;
93 using Teuchos::ArrayView;
101 typedef typename KCRS::StaticCrsGraphType graph_t;
102 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
103 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
104 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
105 typedef typename KCRS::values_type::non_const_type scalar_view_t;
108 typedef UnmanagedView<lno_view_t> u_lno_view_t;
109 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
110 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
113 typedef LocalOrdinal LO;
114 typedef GlobalOrdinal GO;
115 typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
116 typedef Map<LO,GO,NO> map_type;
121 typedef NO::execution_space execution_space;
122 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
125 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
126 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
127 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
130 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
131 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
133 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
134 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
135 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
136 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
138 c_lno_view_t Irowptr;
139 lno_nnz_view_t Icolind;
141 if(!Bview.importMatrix.is_null()) {
142 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
143 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
144 Ivals = Bview.importMatrix->getLocalMatrix().values;
145 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
149 RCP<const map_type> Ccolmap = C.getColMap();
150 size_t m = Aview.origMatrix->getNodeNumRows();
151 size_t n = Ccolmap->getNodeNumElements();
152 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
155 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
156 if(!params.is_null()) {
157 if(params->isParameter(
"openmp: ltg thread max"))
158 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
162 Kokkos::View<u_lno_view_t*> tl_rowptr(
"top_rowptr",thread_max);
163 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind",thread_max);
164 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values",thread_max);
166 double thread_chunk = (double)(m) / thread_max;
169 Kokkos::parallel_for(
"MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
172 size_t my_thread_start = tid * thread_chunk;
173 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
174 size_t my_thread_m = my_thread_stop - my_thread_start;
177 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
180 std::vector<size_t> c_status(n,INVALID);
182 u_lno_view_t Crowptr((
typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m+1)),my_thread_m+1);
183 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
184 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
187 size_t CSR_ip = 0, OLD_ip = 0;
188 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
191 Crowptr(i-my_thread_start) = CSR_ip;
194 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
196 const SC Aval = Avals(k);
200 if (targetMapToOrigRow(Aik) != LO_INVALID) {
207 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
210 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
212 LO Cij = Bcol2Ccol(Bkj);
214 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
216 c_status[Cij] = CSR_ip;
217 Ccolind(CSR_ip) = Cij;
218 Cvals(CSR_ip) = Aval*Bvals(j);
222 Cvals(c_status[Cij]) += Aval*Bvals(j);
233 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
234 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
236 LO Cij = Icol2Ccol(Ikj);
238 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
240 c_status[Cij] = CSR_ip;
241 Ccolind(CSR_ip) = Cij;
242 Cvals(CSR_ip) = Aval*Ivals(j);
246 Cvals(c_status[Cij]) += Aval*Ivals(j);
253 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
255 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
256 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
261 tl_rowptr(tid) = Crowptr;
262 tl_colind(tid) = Ccolind;
263 tl_values(tid) = Cvals;
264 Crowptr(my_thread_m) = CSR_ip;
268 lno_view_t row_mapC(
"non_const_lnow_row", m + 1);
269 lno_nnz_view_t entriesC;
270 scalar_view_t valuesC;
271 copy_out_from_thread_memory(tl_rowptr,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
274 for(
size_t i=0; i<thread_max; i++) {
275 if(tl_rowptr(i).data()) free(tl_rowptr(i).data());
276 if(tl_colind(i).data()) free(tl_colind(i).data());
277 if(tl_values(i).data()) free(tl_values(i).data());
280 #ifdef HAVE_TPETRA_MMM_TIMINGS
281 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix OpenMPSort"))));
284 if (params.is_null() || params->get(
"sort entries",
true))
285 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
286 C.setAllValues(row_mapC,entriesC,valuesC);
291 template<
class Scalar,
294 class LocalOrdinalViewType>
295 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
296 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
297 const LocalOrdinalViewType & targetMapToOrigRow,
298 const LocalOrdinalViewType & targetMapToImportRow,
299 const LocalOrdinalViewType & Bcol2Ccol,
300 const LocalOrdinalViewType & Icol2Ccol,
301 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
302 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
303 const std::string& label,
304 const Teuchos::RCP<Teuchos::ParameterList>& params) {
305 #ifdef HAVE_TPETRA_MMM_TIMINGS
306 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
307 using Teuchos::TimeMonitor;
308 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse LTGCore"))));
311 using Teuchos::Array;
312 using Teuchos::ArrayRCP;
313 using Teuchos::ArrayView;
320 typedef typename KCRS::StaticCrsGraphType graph_t;
321 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
322 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
323 typedef typename KCRS::values_type::non_const_type scalar_view_t;
326 typedef LocalOrdinal LO;
327 typedef GlobalOrdinal GO;
328 typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
329 typedef Map<LO,GO,NO> map_type;
334 typedef NO::execution_space execution_space;
335 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
338 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
339 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
340 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
343 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
344 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
345 const KCRS & Cmat = C.getLocalMatrix();
347 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
348 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
349 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
350 scalar_view_t Cvals = Cmat.values;
352 c_lno_view_t Irowptr;
353 c_lno_nnz_view_t Icolind;
355 if(!Bview.importMatrix.is_null()) {
356 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
357 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
358 Ivals = Bview.importMatrix->getLocalMatrix().values;
362 RCP<const map_type> Ccolmap = C.getColMap();
363 size_t m = Aview.origMatrix->getNodeNumRows();
364 size_t n = Ccolmap->getNodeNumElements();
367 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
368 if(!params.is_null()) {
369 if(params->isParameter(
"openmp: ltg thread max"))
370 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
373 double thread_chunk = (double)(m) / thread_max;
376 Kokkos::parallel_for(
"MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
379 size_t my_thread_start = tid * thread_chunk;
380 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
383 std::vector<size_t> c_status(n,INVALID);
386 size_t CSR_ip = 0, OLD_ip = 0;
387 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
391 CSR_ip = Crowptr(i+1);
392 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
393 c_status[Ccolind(k)] = k;
398 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
400 const SC Aval = Avals(k);
404 if (targetMapToOrigRow(Aik) != LO_INVALID) {
406 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
408 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
410 LO Cij = Bcol2Ccol(Bkj);
412 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
413 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
414 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
416 Cvals(c_status[Cij]) += Aval * Bvals(j);
420 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
421 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
423 LO Cij = Icol2Ccol(Ikj);
425 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
426 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
427 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
429 Cvals(c_status[Cij]) += Aval * Ivals(j);
440 template<
class Scalar,
443 class LocalOrdinalViewType>
444 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
445 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
446 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
447 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
448 const LocalOrdinalViewType & targetMapToOrigRow,
449 const LocalOrdinalViewType & targetMapToImportRow,
450 const LocalOrdinalViewType & Bcol2Ccol,
451 const LocalOrdinalViewType & Icol2Ccol,
452 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
453 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
454 const std::string& label,
455 const Teuchos::RCP<Teuchos::ParameterList>& params) {
456 #ifdef HAVE_TPETRA_MMM_TIMINGS
457 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
458 using Teuchos::TimeMonitor;
459 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix LTGCore"))));
462 using Teuchos::Array;
463 using Teuchos::ArrayRCP;
464 using Teuchos::ArrayView;
469 typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
472 typedef typename KCRS::StaticCrsGraphType graph_t;
473 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
474 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
475 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
476 typedef typename KCRS::values_type::non_const_type scalar_view_t;
479 typedef UnmanagedView<lno_view_t> u_lno_view_t;
480 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
481 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
484 typedef typename scalar_view_t::memory_space scalar_memory_space;
487 typedef LocalOrdinal LO;
488 typedef GlobalOrdinal GO;
490 typedef Map<LO,GO,NO> map_type;
495 typedef NO::execution_space execution_space;
496 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
499 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
500 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
501 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
504 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
505 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
507 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
508 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
509 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
510 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
512 c_lno_view_t Irowptr;
513 lno_nnz_view_t Icolind;
515 if(!Bview.importMatrix.is_null()) {
516 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
517 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
518 Ivals = Bview.importMatrix->getLocalMatrix().values;
519 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
523 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
526 RCP<const map_type> Ccolmap = C.getColMap();
527 size_t m = Aview.origMatrix->getNodeNumRows();
528 size_t n = Ccolmap->getNodeNumElements();
529 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
532 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
533 if(!params.is_null()) {
534 if(params->isParameter(
"openmp: ltg thread max"))
535 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
539 Kokkos::View<u_lno_view_t*> tl_rowptr(
"top_rowptr",thread_max);
540 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind",thread_max);
541 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values",thread_max);
543 double thread_chunk = (double)(m) / thread_max;
546 Kokkos::parallel_for(
"Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
549 size_t my_thread_start = tid * thread_chunk;
550 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
551 size_t my_thread_m = my_thread_stop - my_thread_start;
554 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
557 std::vector<size_t> c_status(n,INVALID);
559 u_lno_view_t Crowptr((
typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m+1)),my_thread_m+1);
560 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
561 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
564 size_t CSR_ip = 0, OLD_ip = 0;
565 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
569 Crowptr(i-my_thread_start) = CSR_ip;
571 SC minusOmegaDval = -omega*Dvals(i,0);
574 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
575 const SC Bval = Bvals(j);
579 LO Cij = Bcol2Ccol(Bij);
582 c_status[Cij] = CSR_ip;
583 Ccolind(CSR_ip) = Cij;
584 Cvals(CSR_ip) = Bvals[j];
590 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
592 const SC Aval = Avals(k);
596 if (targetMapToOrigRow(Aik) != LO_INVALID) {
603 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
606 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
608 LO Cij = Bcol2Ccol(Bkj);
610 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
612 c_status[Cij] = CSR_ip;
613 Ccolind(CSR_ip) = Cij;
614 Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
618 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
629 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
630 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
632 LO Cij = Icol2Ccol(Ikj);
634 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
636 c_status[Cij] = CSR_ip;
637 Ccolind(CSR_ip) = Cij;
638 Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
642 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
649 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
651 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
652 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
657 tl_rowptr(tid) = Crowptr;
658 tl_colind(tid) = Ccolind;
659 tl_values(tid) = Cvals;
660 Crowptr(my_thread_m) = CSR_ip;
666 lno_view_t row_mapC(
"non_const_lnow_row", m + 1);
667 lno_nnz_view_t entriesC;
668 scalar_view_t valuesC;
669 copy_out_from_thread_memory(tl_rowptr,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
672 for(
size_t i=0; i<thread_max; i++) {
673 if(tl_rowptr(i).data()) free(tl_rowptr(i).data());
674 if(tl_colind(i).data()) free(tl_colind(i).data());
675 if(tl_values(i).data()) free(tl_values(i).data());
678 #ifdef HAVE_TPETRA_MMM_TIMINGS
679 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix OpenMPSort"))));
682 if (params.is_null() || params->get(
"sort entries",
true))
683 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
684 C.setAllValues(row_mapC,entriesC,valuesC);
691 template<
class Scalar,
694 class LocalOrdinalViewType>
695 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
696 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
697 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
698 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
699 const LocalOrdinalViewType & targetMapToOrigRow,
700 const LocalOrdinalViewType & targetMapToImportRow,
701 const LocalOrdinalViewType & Bcol2Ccol,
702 const LocalOrdinalViewType & Icol2Ccol,
703 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
704 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
705 const std::string& label,
706 const Teuchos::RCP<Teuchos::ParameterList>& params) {
707 #ifdef HAVE_TPETRA_MMM_TIMINGS
708 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
709 using Teuchos::TimeMonitor;
710 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse LTGCore"))));
712 using Teuchos::Array;
713 using Teuchos::ArrayRCP;
714 using Teuchos::ArrayView;
719 typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
722 typedef typename KCRS::StaticCrsGraphType graph_t;
723 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
724 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
725 typedef typename KCRS::values_type::non_const_type scalar_view_t;
728 typedef typename scalar_view_t::memory_space scalar_memory_space;
731 typedef LocalOrdinal LO;
732 typedef GlobalOrdinal GO;
734 typedef Map<LO,GO,NO> map_type;
739 typedef NO::execution_space execution_space;
740 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
743 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
744 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
745 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
748 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
749 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
750 const KCRS & Cmat = C.getLocalMatrix();
752 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
753 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
754 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
755 scalar_view_t Cvals = Cmat.values;
757 c_lno_view_t Irowptr;
758 c_lno_nnz_view_t Icolind;
760 if(!Bview.importMatrix.is_null()) {
761 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
762 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
763 Ivals = Bview.importMatrix->getLocalMatrix().values;
767 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
770 RCP<const map_type> Ccolmap = C.getColMap();
771 size_t m = Aview.origMatrix->getNodeNumRows();
772 size_t n = Ccolmap->getNodeNumElements();
775 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
776 if(!params.is_null()) {
777 if(params->isParameter(
"openmp: ltg thread max"))
778 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
781 double thread_chunk = (double)(m) / thread_max;
784 Kokkos::parallel_for(
"Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
787 size_t my_thread_start = tid * thread_chunk;
788 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
791 std::vector<size_t> c_status(n,INVALID);
794 size_t CSR_ip = 0, OLD_ip = 0;
795 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
799 CSR_ip = Crowptr(i+1);
801 SC minusOmegaDval = -omega*Dvals(i,0);
803 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
804 c_status[Ccolind(k)] = k;
810 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
811 const SC Bval = Bvals(j);
815 LO Cij = Bcol2Ccol(Bij);
818 Cvals(c_status[Cij]) += Bvals(j);
823 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
825 const SC Aval = Avals(k);
829 if (targetMapToOrigRow(Aik) != LO_INVALID) {
831 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
833 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
835 LO Cij = Bcol2Ccol(Bkj);
837 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
838 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
839 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
841 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
845 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
846 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
848 LO Cij = Icol2Ccol(Ikj);
850 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
851 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
852 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
854 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
866 template<
class InRowptrArrayType,
class InColindArrayType,
class InValsArrayType,
867 class OutRowptrType,
class OutColindType,
class OutValsType>
868 void copy_out_from_thread_memory(
const InRowptrArrayType & Inrowptr,
const InColindArrayType &Incolind,
const InValsArrayType & Invalues,
869 size_t m,
double thread_chunk,
870 OutRowptrType & row_mapC, OutColindType &entriesC, OutValsType & valuesC ) {
871 typedef OutRowptrType lno_view_t;
872 typedef OutColindType lno_nnz_view_t;
873 typedef OutValsType scalar_view_t;
874 typedef typename lno_view_t::execution_space execution_space;
875 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
878 size_t thread_max = Inrowptr.size();
880 lno_view_t thread_start_nnz(
"thread_nnz",thread_max+1);
881 Kokkos::parallel_scan(
"LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (
const size_t i,
size_t& update,
const bool final) {
882 size_t mynnz = Inrowptr(i)(Inrowptr(i).extent(0)-1);
883 if(
final) thread_start_nnz(i) = update;
885 if(
final && i+1==thread_max) thread_start_nnz(i+1)=update;
887 c_nnz_size = thread_start_nnz(thread_max);
890 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size); entriesC = entriesC_;
891 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size); valuesC = valuesC_;
894 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid) {
895 size_t my_thread_start = tid * thread_chunk;
896 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
897 size_t nnz_thread_start = thread_start_nnz(tid);
899 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
900 size_t ii = i - my_thread_start;
902 row_mapC(i) = nnz_thread_start + Inrowptr(tid)(ii);
904 row_mapC(m) = nnz_thread_start + Inrowptr(tid)(ii+1);
908 for(
size_t j = Inrowptr(tid)(ii); j<Inrowptr(tid)(ii+1); j++) {
909 entriesC(nnz_thread_start + j) = Incolind(tid)(j);
910 valuesC(nnz_thread_start + j) = Invalues(tid)(j);
920 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
921 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
922 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
923 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
924 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
925 const LocalOrdinalViewType & Acol2Brow,
926 const LocalOrdinalViewType & Acol2Irow,
927 const LocalOrdinalViewType & Bcol2Ccol,
928 const LocalOrdinalViewType & Icol2Ccol,
929 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
930 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
931 const std::string& label,
932 const Teuchos::RCP<Teuchos::ParameterList>& params) {
933 #ifdef HAVE_TPETRA_MMM_TIMINGS
934 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
935 using Teuchos::TimeMonitor;
936 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK"))));
937 Teuchos::RCP<Teuchos::TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Multiply"))));
939 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
944 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(
new Matrix_t(C.getRowMap(),0));
945 Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(
" MSAK"),params);
947 #ifdef HAVE_TPETRA_MMM_TIMINGS
948 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Scale"))));
954 #ifdef HAVE_TPETRA_MMM_TIMINGS
955 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Add"))));
959 Teuchos::ParameterList jparams;
960 if(!params.is_null()) {
962 jparams.set(
"label",label+std::string(
" MSAK Add"));
964 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
965 Tpetra::MatrixMatrix::add(one,
false,*Bview.origMatrix,Scalar(-omega),
false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,
false));