41 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
42 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
45 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
46 #include "Teuchos_VerboseObject.hpp"
47 #include "Teuchos_Array.hpp"
49 #include "Tpetra_ConfigDefs.hpp"
50 #include "Tpetra_CrsMatrix.hpp"
52 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_ConfigDefs.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Export.hpp"
60 #include "Teuchos_FancyOStream.hpp"
71 namespace TripleMatrixMultiply{
79 template <
class Scalar,
91 bool call_FillComplete_on_result,
92 const std::string& label,
93 const Teuchos::RCP<Teuchos::ParameterList>& params)
98 typedef LocalOrdinal LO;
99 typedef GlobalOrdinal GO;
108 #ifdef HAVE_TPETRA_MMM_TIMINGS
109 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
110 using Teuchos::TimeMonitor;
111 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Setup"))));
114 const std::string prefix =
"TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
119 TEUCHOS_TEST_FOR_EXCEPTION(!R.
isFillComplete(), std::runtime_error, prefix <<
"Matrix R is not fill complete.");
120 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
121 TEUCHOS_TEST_FOR_EXCEPTION(!P.
isFillComplete(), std::runtime_error, prefix <<
"Matrix P is not fill complete.");
126 RCP<const crs_matrix_type> Rprime =
null;
130 RCP<const crs_matrix_type> Aprime =
null;
134 RCP<const crs_matrix_type> Pprime =
null;
144 const bool newFlag = !Ac.
getGraph()->isLocallyIndexed() && !Ac.
getGraph()->isGloballyIndexed();
146 if (transposeR && &R != &P) {
147 transposer_type transposer(rcpFromRef (R));
148 Rprime = transposer.createTranspose();
150 Rprime = rcpFromRef(R);
154 transposer_type transposer(rcpFromRef (A));
155 Aprime = transposer.createTranspose();
157 Aprime = rcpFromRef(A);
161 transposer_type transposer(rcpFromRef (P));
162 Pprime = transposer.createTranspose();
164 Pprime = rcpFromRef(P);
177 TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
178 prefix <<
"ERROR, inner dimensions of op(R) and op(A) "
179 "must match for matrix-matrix product. op(R) is "
180 << Rleft <<
"x" << Rright <<
", op(A) is "<< Aleft <<
"x" << Aright);
182 TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
183 prefix <<
"ERROR, inner dimensions of op(A) and op(P) "
184 "must match for matrix-matrix product. op(A) is "
185 << Aleft <<
"x" << Aright <<
", op(P) is "<< Pleft <<
"x" << Pright);
191 TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.
getGlobalNumRows(), std::runtime_error,
192 prefix <<
"ERROR, dimensions of result Ac must "
193 "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.
getGlobalNumRows()
194 <<
" rows, should have at least " << Rleft << std::endl);
206 int numProcs = P.
getComm()->getSize();
210 crs_matrix_struct_type Rview;
211 crs_matrix_struct_type Aview;
212 crs_matrix_struct_type Pview;
214 RCP<const map_type> targetMap_R = Rprime->getRowMap();
215 RCP<const map_type> targetMap_A = Aprime->getRowMap();
216 RCP<const map_type> targetMap_P = Pprime->getRowMap();
218 #ifdef HAVE_TPETRA_MMM_TIMINGS
219 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All I&X"))));
225 RCP<const import_type> dummyImporter;
227 if (!(transposeR && &R == &P))
228 MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter,
true, label, params);
230 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
235 targetMap_P = Aprime->getColMap();
238 MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(),
false, label, params);
241 RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
243 bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
244 if (needs_final_export)
247 Actemp = rcp(&Ac,
false);
249 #ifdef HAVE_TPETRA_MMM_TIMINGS
250 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Multiply"))));
254 if (call_FillComplete_on_result && newFlag) {
255 if (transposeR && &R == &P)
256 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
258 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
259 }
else if (call_FillComplete_on_result) {
262 if (transposeR && &R == &P)
263 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
265 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
288 if (transposeR && &R == &P)
289 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
291 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
294 if (needs_final_export) {
295 #ifdef HAVE_TPETRA_MMM_TIMINGS
296 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP exportAndFillComplete"))));
298 Teuchos::ParameterList labelList;
299 labelList.set(
"Timer Label", label);
300 RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
301 if(!params.is_null()) labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
302 export_type exporter = export_type(*Pprime->getGraph()->getImporter());
303 Actemp->exportAndFillComplete(Acprime,
305 Acprime->getDomainMap(),
306 Acprime->getRangeMap(),
307 rcp(&labelList,
false));
310 #ifdef HAVE_TPETRA_MMM_STATISTICS
311 printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(
" RAP MMM"));
322 template<
class CrsMatrixType>
323 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
324 size_t nnzPerRowA = 100, Pcols = 100;
325 if (A.getNodeNumEntries() > 0)
326 nnzPerRowA = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 9;
327 if (P.getNodeNumEntries() > 0)
328 Pcols = (P.getNodeNumCols() > 0) ? P.getNodeNumCols() : 100;
329 return (
size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
335 template<
class Scalar,
339 void mult_R_A_P_newmatrix(
340 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
341 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
342 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
343 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
344 const std::string& label,
345 const Teuchos::RCP<Teuchos::ParameterList>& params)
347 using Teuchos::Array;
348 using Teuchos::ArrayRCP;
349 using Teuchos::ArrayView;
354 typedef LocalOrdinal LO;
355 typedef GlobalOrdinal GO;
358 typedef Import<LO,GO,NO> import_type;
359 typedef Map<LO,GO,NO> map_type;
361 #ifdef HAVE_TPETRA_MMM_TIMINGS
362 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
363 using Teuchos::TimeMonitor;
364 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
366 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
369 RCP<const import_type> Acimport;
370 RCP<const map_type> Accolmap;
371 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
372 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ?
373 Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
380 Array<LO> Pcol2Accol(Pview.colMap->getNodeNumElements()), PIcol2Accol;
382 if (Pview.importMatrix.is_null()) {
385 Accolmap = Pview.colMap;
387 for (
size_t i = 0; i < Pview.colMap->getNodeNumElements(); i++)
388 Pcol2Accol[i] = Teuchos::as<LO>(i);
399 if (!Pimport.is_null() && !Iimport.is_null())
400 Acimport = Pimport->setUnion(*Iimport);
402 else if (!Pimport.is_null() && Iimport.is_null())
403 Acimport = Pimport->setUnion();
405 else if (Pimport.is_null() && !Iimport.is_null())
406 Acimport = Iimport->setUnion();
409 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
411 Accolmap = Acimport->getTargetMap();
416 TEUCHOS_TEST_FOR_EXCEPTION(!Acimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
417 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
424 PIcol2Accol.resize(Pview.importMatrix->getColMap()->getNodeNumElements());
425 ArrayView<const GO> Pgid = Pview.origMatrix->getColMap()->getNodeElementList();
426 ArrayView<const GO> Igid = Pview.importMatrix->getColMap()->getNodeElementList();
428 for (
size_t i = 0; i < Pview.origMatrix->getColMap()->getNodeNumElements(); i++)
429 Pcol2Accol[i] = Accolmap->getLocalElement(Pgid[i]);
430 for (
size_t i = 0; i < Pview.importMatrix->getColMap()->getNodeNumElements(); i++)
431 PIcol2Accol[i] = Accolmap->getLocalElement(Igid[i]);
438 Ac.replaceColMap(Accolmap);
456 Array<LO> Acol2PRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
457 Array<LO> Acol2PRowImport(Aview.colMap->getNodeNumElements(), LO_INVALID);
459 for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
460 LO P_LID = Pview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
461 if (P_LID != LO_INVALID) {
462 Acol2PRow[i] = P_LID;
465 LO I_LID = Pview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
466 Acol2PRowImport[i] = I_LID;
472 KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
473 mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
474 Acol2PRow, Acol2PRowImport, Pcol2Accol, PIcol2Accol,
475 Ac, Acimport, label, params);
479 template<
class InRowptrArrayType,
class InColindArrayType,
class InValsArrayType,
480 class OutRowptrType,
class OutColindType,
class OutValsType>
481 void copy_out_from_thread_memory(
const InRowptrArrayType & Inrowptr,
const InColindArrayType &Incolind,
const InValsArrayType & Invalues,
482 size_t m,
double thread_chunk,
483 OutRowptrType & row_mapC, OutColindType &entriesC, OutValsType & valuesC ) {
484 typedef OutRowptrType lno_view_t;
485 typedef OutColindType lno_nnz_view_t;
486 typedef OutValsType scalar_view_t;
487 typedef typename lno_view_t::execution_space execution_space;
488 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
491 size_t thread_max = Inrowptr.size();
493 lno_view_t thread_start_nnz(
"thread_nnz",thread_max+1);
495 lno_view_t thread_nnz_count(
"thread_nnz_counts", thread_max);
496 for(
size_t i = 0; i < thread_max; i++)
497 thread_nnz_count(i) = Inrowptr(i)(Inrowptr(i).extent(0) - 1);
500 c_nnz_size = thread_start_nnz(thread_max);
503 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size); entriesC = entriesC_;
504 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size); valuesC = valuesC_;
507 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid) {
508 size_t my_thread_start = tid * thread_chunk;
509 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
510 size_t nnz_thread_start = thread_start_nnz(tid);
512 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
513 size_t ii = i - my_thread_start;
515 row_mapC(i) = nnz_thread_start + Inrowptr(tid)(ii);
517 row_mapC(m) = nnz_thread_start + Inrowptr(tid)(ii+1);
521 for(
size_t j = Inrowptr(tid)(ii); j<Inrowptr(tid)(ii+1); j++) {
522 entriesC(nnz_thread_start + j) = Incolind(tid)(j);
523 valuesC(nnz_thread_start + j) = Invalues(tid)(j);
532 template<
class Scalar,
536 void KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
537 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
538 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
539 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
540 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
541 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
542 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
543 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
544 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
545 const std::string& label,
546 const Teuchos::RCP<Teuchos::ParameterList>& params) {
547 #ifdef HAVE_TPETRA_MMM_TIMINGS
548 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
549 using Teuchos::TimeMonitor;
550 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore"))));
553 using Teuchos::Array;
554 using Teuchos::ArrayRCP;
555 using Teuchos::ArrayView;
560 typedef LocalOrdinal LO;
561 typedef GlobalOrdinal GO;
563 typedef Map<LO,GO,NO> map_type;
564 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
565 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
566 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
569 RCP<const map_type> Accolmap = Ac.getColMap();
570 size_t m = Rview.origMatrix->getNodeNumRows();
571 size_t n = Accolmap->getNodeNumElements();
574 ArrayRCP<const size_t> Rrowptr_RCP, Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
575 ArrayRCP<size_t> Acrowptr_RCP;
576 ArrayRCP<const LO> Rcolind_RCP, Acolind_RCP, Pcolind_RCP, Icolind_RCP;
577 ArrayRCP<LO> Accolind_RCP;
578 ArrayRCP<const Scalar> Rvals_RCP, Avals_RCP, Pvals_RCP, Ivals_RCP;
579 ArrayRCP<SC> Acvals_RCP;
586 Rview.origMatrix->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
587 Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
588 Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
590 if (!Pview.importMatrix.is_null())
591 Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
598 ArrayView<const size_t> Rrowptr, Arowptr, Prowptr, Irowptr;
599 ArrayView<const LO> Rcolind, Acolind, Pcolind, Icolind;
600 ArrayView<const SC> Rvals, Avals, Pvals, Ivals;
601 ArrayView<size_t> Acrowptr;
602 ArrayView<LO> Accolind;
603 ArrayView<SC> Acvals;
604 Rrowptr = Rrowptr_RCP(); Rcolind = Rcolind_RCP(); Rvals = Rvals_RCP();
605 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
606 Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
607 if (!Pview.importMatrix.is_null()) {
608 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
618 size_t nnzAllocated = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
619 Acrowptr_RCP.resize(m+1); Acrowptr = Acrowptr_RCP();
620 Accolind_RCP.resize(nnzAllocated); Accolind = Accolind_RCP();
621 Acvals_RCP.resize(nnzAllocated); Acvals = Acvals_RCP();
631 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
632 Array<size_t> ac_status(n, ST_INVALID);
642 size_t nnz = 0, nnz_old = 0;
643 for (
size_t i = 0; i < m; i++) {
649 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
651 const SC Rik = Rvals[kk];
655 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
657 const SC Akl = Avals[ll];
662 if (Acol2PRow[l] != LO_INVALID) {
669 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
672 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
674 LO Acj = Pcol2Accol[j];
677 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
678 #ifdef HAVE_TPETRA_DEBUG
680 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
682 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
685 ac_status[Acj] = nnz;
687 Acvals[nnz] = Rik*Akl*Plj;
690 Acvals[ac_status[Acj]] += Rik*Akl*Plj;
700 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
701 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
703 LO Acj = PIcol2Accol[j];
706 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
707 #ifdef HAVE_TPETRA_DEBUG
709 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
711 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
714 ac_status[Acj] = nnz;
716 Acvals[nnz] = Rik*Akl*Plj;
719 Acvals[ac_status[Acj]] += Rik*Akl*Plj;
726 if (nnz + n > nnzAllocated) {
728 Accolind_RCP.resize(nnzAllocated); Accolind = Accolind_RCP();
729 Acvals_RCP.resize(nnzAllocated); Acvals = Acvals_RCP();
737 Acvals_RCP.resize(nnz);
738 Accolind_RCP.resize(nnz);
740 #ifdef HAVE_TPETRA_MMM_TIMINGS
741 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
748 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
750 Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
752 #ifdef HAVE_TPETRA_MMM_TIMINGS
753 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
764 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
765 labelList->set(
"Timer Label",label);
766 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
767 RCP<const Export<LO,GO,NO> > dummyExport;
768 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
769 Rview.origMatrix->getRangeMap(),
776 #ifdef HAVE_TPETRA_INST_OPENMP
780 template<
class Scalar,
783 struct KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>
785 static inline void mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
786 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
787 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
788 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
789 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
790 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
791 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
792 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
793 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
794 const std::string& label,
795 const Teuchos::RCP<Teuchos::ParameterList>& params) {
797 using Tpetra::MatrixMatrix::UnmanagedView;
798 #ifdef HAVE_TPETRA_MMM_TIMINGS
799 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
800 using Teuchos::TimeMonitor;
801 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore"))));
804 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
806 typedef LocalOrdinal LO;
807 typedef GlobalOrdinal GO;
809 typedef Map<LO,GO,NO> map_type;
811 typedef typename KCRS::StaticCrsGraphType graph_t;
812 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
813 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
814 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
815 typedef typename KCRS::values_type::non_const_type scalar_view_t;
816 typedef typename KCRS::device_type device_t;
817 typedef typename device_t::execution_space execution_space;
818 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
821 typedef UnmanagedView<lno_view_t> u_lno_view_t;
822 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
823 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
825 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
826 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
829 RCP<const map_type> Accolmap = Ac.getColMap();
830 size_t m = Rview.origMatrix->getNodeNumRows();
831 size_t n = Accolmap->getNodeNumElements();
834 const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
835 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
836 const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
838 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
839 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
840 lno_nnz_view_t Icolind;
841 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
844 if (!Pview.importMatrix.is_null())
846 const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
847 Irowptr = Imat.graph.row_map;
848 Icolind = Imat.graph.entries;
860 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
861 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
864 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
865 if(!params.is_null()) {
866 if(params->isParameter(
"openmp: ltg thread max"))
867 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
870 double thread_chunk = (double)(m) / thread_max;
880 Kokkos::View<u_lno_view_t*> tl_rowptr(
"top_rowptr", thread_max);
881 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind", thread_max);
882 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values", thread_max);
885 Kokkos::parallel_for(
"MMM::RAP::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
888 size_t my_thread_start = tid * thread_chunk;
889 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
890 size_t my_thread_m = my_thread_stop - my_thread_start;
892 size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
894 std::vector<size_t> ac_status(n, INVALID);
897 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
898 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
899 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
902 size_t nnz = 0, nnz_old = 0;
904 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
905 Acrowptr(i - my_thread_start) = nnz;
907 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
909 const SC Rik = Rvals(kk);
913 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
915 const SC Akl = Avals(ll);
919 if (Acol2PRow[l] != LO_INVALID) {
926 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
929 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
931 LO Acj = Pcol2Accol[j];
934 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
935 #ifdef HAVE_TPETRA_DEBUG
937 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
939 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
942 ac_status[Acj] = nnz;
944 Acvals(nnz) = Rik*Akl*Plj;
947 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
957 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
958 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
960 LO Acj = PIcol2Accol[j];
963 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
964 #ifdef HAVE_TPETRA_DEBUG
966 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
968 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
971 ac_status[Acj] = nnz;
973 Acvals(nnz) = Rik*Akl*Plj;
976 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
983 if (nnz + n > nnzAllocated) {
985 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
986 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
990 Acrowptr(my_thread_m) = nnz;
991 tl_rowptr(tid) = Acrowptr;
992 tl_colind(tid) = Accolind;
993 tl_values(tid) = Acvals;
995 #ifdef HAVE_TPETRA_MMM_TIMINGS
996 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local"))));
999 lno_view_t rowmapAc(
"non_const_lnow_row", m + 1);
1000 lno_nnz_view_t entriesAc;
1001 scalar_view_t valuesAc;
1002 copy_out_from_thread_memory(tl_rowptr, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1004 for(
size_t i=0; i<thread_max; i++) {
1005 if(tl_rowptr(i).data()) free(tl_rowptr(i).data());
1006 if(tl_colind(i).data()) free(tl_colind(i).data());
1007 if(tl_values(i).data()) free(tl_values(i).data());
1010 #ifdef HAVE_TPETRA_MMM_TIMINGS
1011 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1015 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1017 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1019 #ifdef HAVE_TPETRA_MMM_TIMINGS
1020 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1031 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1032 labelList->set(
"Timer Label",label);
1033 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1034 RCP<const Export<LO,GO,NO> > dummyExport;
1035 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1036 Rview.origMatrix->getRangeMap(),
1046 static inline void mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1047 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1048 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1049 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1050 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1051 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1052 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1053 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1054 const std::string& label,
1055 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1056 #ifdef HAVE_TPETRA_MMM_TIMINGS
1057 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1058 using Teuchos::TimeMonitor;
1059 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix OpenMP"))));
1063 using Tpetra::MatrixMatrix::UnmanagedView;
1065 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1067 typedef LocalOrdinal LO;
1068 typedef GlobalOrdinal GO;
1070 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1072 typedef typename KCRS::StaticCrsGraphType graph_t;
1073 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1074 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1075 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1076 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1077 typedef typename KCRS::device_type device_t;
1078 typedef typename device_t::execution_space execution_space;
1079 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1082 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1083 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1084 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1086 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1087 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1092 size_t n = Ac.getRowMap()->getNodeNumElements();
1093 LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1095 #ifdef HAVE_TPETRA_MMM_TIMINGS
1096 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
1100 transposer_type transposer (Pview.origMatrix,label+std::string(
"XP: "));
1101 RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(
new Teuchos::ParameterList);
1102 if (!params.is_null())
1103 transposeParams->set(
"compute global constants",
1104 params->get(
"compute global constants: temporaries",
1106 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
1109 const KCRS & Rmat = Ptrans->getLocalMatrix();
1110 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1111 const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1113 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
1114 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
1115 lno_nnz_view_t Icolind;
1116 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1117 scalar_view_t Ivals;
1119 if (!Pview.importMatrix.is_null())
1121 const KCRS & Imat = Pview.importMatrix->getLocalMatrix();
1122 Irowptr = Imat.graph.row_map;
1123 Icolind = Imat.graph.entries;
1124 Ivals = Imat.values;
1127 #ifdef HAVE_TPETRA_MMM_TIMINGS
1128 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Alloc"))));
1131 size_t Acest_total_nnz = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix),
1132 Ac.getColMap()->getNodeNumElements());
1133 size_t Acest_nnz_per_row = std::ceil(Acest_total_nnz / n);
1134 size_t nnzPerRowA = 100;
1135 if (Aview.origMatrix->getNodeNumEntries() > 0)
1136 nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();
1146 const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1149 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1150 if(!params.is_null()) {
1151 if(params->isParameter(
"openmp: ltg thread max"))
1152 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1155 double thread_chunk = (double)(n) / thread_max;
1158 #ifdef HAVE_TPETRA_MMM_TIMINGS
1159 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
1170 Kokkos::View<u_lno_view_t*> tl_rowptr(
"top_rowptr", thread_max);
1171 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind", thread_max);
1172 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values", thread_max);
1174 Kokkos::parallel_for(
"MMM::PTAP::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1177 size_t my_thread_start = tid * thread_chunk;
1178 size_t my_thread_stop = tid == thread_max-1 ? n : (tid+1)*thread_chunk;
1179 size_t my_thread_n = my_thread_stop - my_thread_start;
1181 size_t nnzAllocated = (size_t) (my_thread_n * Acest_nnz_per_row + 100);
1183 std::vector<size_t> ac_status(maxAccol + 1, ST_INVALID);
1186 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_n+1)), my_thread_n + 1);
1187 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1188 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1191 size_t nnz = 0, nnz_old = 0;
1193 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1194 Acrowptr(i - my_thread_start) = nnz;
1196 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1198 const SC Rik = Rvals(kk);
1202 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1204 const SC Akl = Avals(ll);
1208 if (Acol2PRow[l] != LO_INVALID) {
1215 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1218 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1223 LO Acj = Pcol2Accol[j];
1225 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1226 #ifdef HAVE_TPETRA_DEBUG
1228 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1230 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.extent(0) << std::endl);
1233 ac_status[Acj] = nnz;
1234 Accolind(nnz) = Acj;
1235 Acvals(nnz) = Rik*Akl*Plj;
1238 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1248 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1249 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1254 LO Acj = PIcol2Accol[j];
1256 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1257 #ifdef HAVE_TPETRA_DEBUG
1259 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1261 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1264 ac_status[Acj] = nnz;
1265 Accolind(nnz) = Acj;
1266 Acvals(nnz) = Rik*Akl*Plj;
1269 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1276 if (nnz + std::max(5*nnzPerRowA, n) > nnzAllocated) {
1278 nnzAllocated = std::max(nnzAllocated, nnz + std::max(5*nnzPerRowA, n));
1279 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1280 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1284 tl_rowptr(tid) = Acrowptr;
1285 tl_colind(tid) = Accolind;
1286 tl_values(tid) = Acvals;
1287 Acrowptr(my_thread_n) = nnz;
1290 #ifdef HAVE_TPETRA_MMM_TIMINGS
1291 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP copy from thread local"))));
1293 lno_view_t rowmapAc(
"non_const_lnow_row", n + 1);
1294 lno_nnz_view_t entriesAc;
1295 scalar_view_t valuesAc;
1296 copy_out_from_thread_memory(tl_rowptr, tl_colind, tl_values, n, thread_chunk, rowmapAc, entriesAc, valuesAc);
1298 #ifdef HAVE_TPETRA_MMM_TIMINGS
1299 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
1306 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1309 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1311 #ifdef HAVE_TPETRA_MMM_TIMINGS
1312 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
1323 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1324 labelList->set(
"Timer Label",label);
1326 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1327 RCP<const Export<LO,GO,NO> > dummyExport;
1328 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1329 Pview.origMatrix->getDomainMap(),
1331 dummyExport, labelList);
1338 template<
class Scalar,
1340 class GlobalOrdinal,
1342 void mult_PT_A_P_newmatrix(
1343 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1344 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1345 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1346 const std::string& label,
1347 const Teuchos::RCP<Teuchos::ParameterList>& params)
1349 using Teuchos::Array;
1350 using Teuchos::ArrayRCP;
1351 using Teuchos::ArrayView;
1356 typedef LocalOrdinal LO;
1357 typedef GlobalOrdinal GO;
1360 typedef Import<LO,GO,NO> import_type;
1361 typedef Map<LO,GO,NO> map_type;
1363 #ifdef HAVE_TPETRA_MMM_TIMINGS
1364 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1365 using Teuchos::TimeMonitor;
1366 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP")))));
1369 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1372 RCP<const import_type> Acimport;
1373 RCP<const map_type> Accolmap;
1374 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
1375 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ?
1376 Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
1383 Array<LO> Pcol2Accol(Pview.colMap->getNodeNumElements()), PIcol2Accol;
1385 if (Pview.importMatrix.is_null()) {
1388 Accolmap = Pview.colMap;
1390 for (
size_t i = 0; i < Pview.colMap->getNodeNumElements(); i++)
1391 Pcol2Accol[i] = Teuchos::as<LO>(i);
1402 if (!Pimport.is_null() && !Iimport.is_null())
1403 Acimport = Pimport->setUnion(*Iimport);
1405 else if (!Pimport.is_null() && Iimport.is_null())
1406 Acimport = Pimport->setUnion();
1408 else if (Pimport.is_null() && !Iimport.is_null())
1409 Acimport = Iimport->setUnion();
1412 throw std::runtime_error(
"TpetraExt::PTAP status of matrix importers is nonsensical");
1414 Accolmap = Acimport->getTargetMap();
1419 TEUCHOS_TEST_FOR_EXCEPTION(!Acimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
1420 std::runtime_error,
"Tpetra::PTAP: Import setUnion messed with the DomainMap in an unfortunate way");
1427 PIcol2Accol.resize(Pview.importMatrix->getColMap()->getNodeNumElements());
1428 ArrayView<const GO> Pgid = Pview.origMatrix->getColMap()->getNodeElementList();
1429 ArrayView<const GO> Igid = Pview.importMatrix->getColMap()->getNodeElementList();
1431 for (
size_t i = 0; i < Pview.origMatrix->getColMap()->getNodeNumElements(); i++)
1432 Pcol2Accol[i] = Accolmap->getLocalElement(Pgid[i]);
1434 for (
size_t i = 0; i < Pview.importMatrix->getColMap()->getNodeNumElements(); i++)
1435 PIcol2Accol[i] = Accolmap->getLocalElement(Igid[i]);
1443 Ac.replaceColMap(Accolmap);
1463 Array<LO> Acol2PRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
1464 Array<LO> Acol2PRowImport(Aview.colMap->getNodeNumElements(), LO_INVALID);
1465 for (LO A_LID = Aview.colMap->getMinLocalIndex(); A_LID <= Aview.colMap->getMaxLocalIndex(); A_LID++) {
1466 GO A_GID = Aview.colMap->getGlobalElement(A_LID);
1467 LO P_LID = Pview.origMatrix->getRowMap()->getLocalElement(A_GID);
1468 if (P_LID != LO_INVALID) {
1469 Acol2PRow[A_LID] = P_LID;
1471 LO I_LID = Pview.importMatrix->getRowMap()->getLocalElement(A_GID);
1472 Acol2PRowImport[A_LID] = I_LID;
1478 KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper(Aview,
1495 template<
class Scalar,
1497 class GlobalOrdinal,
1499 void KernelWrappers3MMM_Specialized<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1500 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1501 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1502 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1503 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1504 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1505 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1506 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1507 const std::string& label,
1508 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1509 #ifdef HAVE_TPETRA_MMM_TIMINGS
1510 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1511 using Teuchos::TimeMonitor;
1512 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix SerialCore"))));
1515 using Teuchos::Array;
1516 using Teuchos::ArrayRCP;
1517 using Teuchos::ArrayView;
1522 typedef LocalOrdinal LO;
1523 typedef GlobalOrdinal GO;
1525 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1526 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1527 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1532 size_t n = Ac.getRowMap()->getNodeNumElements();
1533 LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1536 ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
1537 ArrayRCP<size_t> Acrowptr_RCP;
1538 ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
1539 ArrayRCP<LO> Accolind_RCP;
1540 ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
1541 ArrayRCP<SC> Acvals_RCP;
1548 Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1549 Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
1551 if (!Pview.importMatrix.is_null())
1552 Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1559 ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
1560 ArrayView<const LO> Acolind, Pcolind, Icolind;
1561 ArrayView<const SC> Avals, Pvals, Ivals;
1562 ArrayView<size_t> Acrowptr;
1563 ArrayView<LO> Accolind;
1564 ArrayView<SC> Acvals;
1565 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1566 Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
1567 if (!Pview.importMatrix.is_null()) {
1568 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1571 #ifdef HAVE_TPETRA_MMM_TIMINGS
1572 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
1579 ArrayRCP<const size_t> Rrowptr_RCP;
1580 ArrayRCP<const LO> Rcolind_RCP;
1581 ArrayRCP<const Scalar> Rvals_RCP;
1582 ArrayView<const size_t> Rrowptr;
1583 ArrayView<const LO> Rcolind;
1584 ArrayView<const SC> Rvals;
1586 transposer_type transposer (Pview.origMatrix,label+std::string(
"XP: "));
1587 RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(
new Teuchos::ParameterList);
1588 if (!params.is_null())
1589 transposeParams->set(
"compute global constants",
1590 params->get(
"compute global constants: temporaries",
1592 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
1594 Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
1595 Rrowptr = Rrowptr_RCP();
1596 Rcolind = Rcolind_RCP();
1597 Rvals = Rvals_RCP();
1599 #ifdef HAVE_TPETRA_MMM_TIMINGS
1600 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Alloc"))));
1603 size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix),
1604 Ac.getColMap()->getNodeNumElements());
1605 size_t nnzPerRowA = 100;
1606 if (Aview.origMatrix->getNodeNumEntries() > 0)
1607 nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();
1608 Acrowptr_RCP.resize(n+1); Acrowptr = Acrowptr_RCP();
1609 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1610 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1628 const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1629 Array<size_t> ac_status(maxAccol+1, ST_INVALID);
1632 #ifdef HAVE_TPETRA_MMM_TIMINGS
1633 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
1643 size_t nnz = 0, nnz_old = 0;
1646 for (
size_t i = 0; i < n; i++) {
1652 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1654 const SC Rik = Rvals[kk];
1658 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1660 const SC Akl = Avals[ll];
1664 if (Acol2PRow[l] != LO_INVALID) {
1671 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1674 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1679 LO Acj = Pcol2Accol[j];
1681 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1682 #ifdef HAVE_TPETRA_DEBUG
1684 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1686 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1689 ac_status[Acj] = nnz;
1690 Accolind[nnz] = Acj;
1691 Acvals[nnz] = Rik*Akl*Plj;
1694 Acvals[ac_status[Acj]] += Rik*Akl*Plj;
1704 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1705 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1710 LO Acj = PIcol2Accol[j];
1712 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1713 #ifdef HAVE_TPETRA_DEBUG
1715 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1717 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1720 ac_status[Acj] = nnz;
1721 Accolind[nnz] = Acj;
1722 Acvals[nnz] = Rik*Akl*Plj;
1725 Acvals[ac_status[Acj]] += Rik*Akl*Plj;
1732 if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1734 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1735 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1736 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1744 Acvals_RCP.resize(nnz);
1745 Accolind_RCP.resize(nnz);
1747 #ifdef HAVE_TPETRA_MMM_TIMINGS
1748 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
1755 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1758 Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1760 #ifdef HAVE_TPETRA_MMM_TIMINGS
1761 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
1772 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1773 labelList->set(
"Timer Label",label);
1775 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1776 RCP<const Export<LO,GO,NO> > dummyExport;
1777 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1778 Pview.origMatrix->getDomainMap(),
1780 dummyExport, labelList);
1789 template<
class Scalar,
1791 class GlobalOrdinal,
1793 void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1794 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1795 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1796 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1797 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1798 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1799 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1800 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1801 const std::string& label,
1802 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1803 #ifdef HAVE_TPETRA_MMM_TIMINGS
1804 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1805 using Teuchos::TimeMonitor;
1806 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix SerialCore"))));
1809 using Teuchos::Array;
1810 using Teuchos::ArrayRCP;
1811 using Teuchos::ArrayView;
1816 typedef LocalOrdinal LO;
1817 typedef GlobalOrdinal GO;
1819 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1820 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1821 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1826 size_t n = Ac.getRowMap()->getNodeNumElements();
1827 LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1830 ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
1831 ArrayRCP<size_t> Acrowptr_RCP;
1832 ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
1833 ArrayRCP<LO> Accolind_RCP;
1834 ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
1835 ArrayRCP<SC> Acvals_RCP;
1842 Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1843 Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
1845 if (!Pview.importMatrix.is_null())
1846 Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1853 ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
1854 ArrayView<const LO> Acolind, Pcolind, Icolind;
1855 ArrayView<const SC> Avals, Pvals, Ivals;
1856 ArrayView<size_t> Acrowptr;
1857 ArrayView<LO> Accolind;
1858 ArrayView<SC> Acvals;
1859 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1860 Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
1861 if (!Pview.importMatrix.is_null()) {
1862 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1874 #ifdef HAVE_TPETRA_MMM_TIMINGS
1875 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
1882 ArrayRCP<const size_t> Rrowptr_RCP;
1883 ArrayRCP<const LO> Rcolind_RCP;
1884 ArrayRCP<const Scalar> Rvals_RCP;
1885 ArrayView<const size_t> Rrowptr;
1886 ArrayView<const LO> Rcolind;
1887 ArrayView<const SC> Rvals;
1889 transposer_type transposer (Pview.origMatrix,label+std::string(
"XP: "));
1890 RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(
new Teuchos::ParameterList);
1891 if (!params.is_null())
1892 transposeParams->set(
"compute global constants",
1893 params->get(
"compute global constants: temporaries",
1895 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
1897 Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
1898 Rrowptr = Rrowptr_RCP();
1899 Rcolind = Rcolind_RCP();
1900 Rvals = Rvals_RCP();
1905 #ifdef HAVE_TPETRA_MMM_TIMINGS
1906 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP graph"))));
1909 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1910 Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1912 size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1913 size_t nnzPerRowA = 100;
1914 if (Aview.origMatrix->getNodeNumEntries() > 0)
1915 nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();
1916 Acrowptr_RCP.resize(n+1);
1917 Acrowptr = Acrowptr_RCP();
1918 Accolind_RCP.resize(nnz_alloc);
1919 Accolind = Accolind_RCP();
1921 size_t nnz = 0, nnz_old = 0;
1922 for (
size_t i = 0; i < n; i++) {
1928 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1931 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1934 if (Acol2PRow[l] != LO_INVALID) {
1941 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1944 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1946 LO Acj = Pcol2Accol[j];
1948 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1950 ac_status[Acj] = nnz;
1951 Accolind[nnz] = Acj;
1962 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1963 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1965 LO Acj = PIcol2Accol[j];
1967 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1969 ac_status[Acj] = nnz;
1970 Accolind[nnz] = Acj;
1980 if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1982 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1983 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1984 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1991 Accolind_RCP.resize(nnz);
1992 Accolind = Accolind_RCP();
1995 Acvals_RCP.resize(nnz, SC_ZERO);
1996 Acvals = Acvals_RCP();
2003 #ifdef HAVE_TPETRA_MMM_TIMINGS
2004 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
2008 for (
size_t k = 0; k < n; k++) {
2009 for (
size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
2011 const SC Pki = Pvals[ii];
2012 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
2014 const SC Akl = Avals[ll];
2017 if (Acol2PRow[l] != LO_INVALID) {
2024 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
2025 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
2027 LO Acj = Pcol2Accol[j];
2029 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
2030 if (Accolind[pp] == Acj)
2034 Acvals[pp] += Pki*Akl*Pvals[jj];
2043 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
2044 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
2046 LO Acj = PIcol2Accol[j];
2048 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
2049 if (Accolind[pp] == Acj)
2053 Acvals[pp] += Pki*Akl*Ivals[jj];
2061 #ifdef HAVE_TPETRA_MMM_TIMINGS
2062 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
2069 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
2072 Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
2074 #ifdef HAVE_TPETRA_MMM_TIMINGS
2075 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
2086 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2087 labelList->set(
"Timer Label",label);
2089 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2090 RCP<const Export<LO,GO,NO> > dummyExport;
2091 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
2092 Pview.origMatrix->getDomainMap(),
2094 dummyExport, labelList);
2108 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
2111 void TripleMatrixMultiply::MultiplyRAP( \
2112 const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
2114 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2116 const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
2118 CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
2119 bool call_FillComplete_on_result, \
2120 const std::string & label, \
2121 const Teuchos::RCP<Teuchos::ParameterList>& params); \
2125 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP