42 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
55 #include "Kokkos_Core.hpp"
82 #ifndef DOXYGEN_SHOULD_SKIP_THIS
85 #endif // DOXYGEN_SHOULD_SKIP_THIS
92 namespace UnpackAndCombineCrsMatrixImpl {
// Unpack one packed row out of the `imports` byte buffer.
//
// The offsets computed below lay the row out as consecutive byte ranges
// beginning at `offset`: the entry count, then the global column indices,
// then (only when `pids_out` is non-empty) the process ids, then the values.
// The number of bytes actually consumed is cross-checked against the sum of
// those range lengths before returning.
106 template<
class ST,
class LO,
class GO,
class DT,
class BDT>
// imports: packed input buffer; num_bytes: bytes available for this row;
// num_ent: number of entries the caller expects in this row;
// num_bytes_per_value: packed size of one value.
111 const Kokkos::View<const char*, BDT>& imports,
113 const size_t num_bytes,
114 const size_t num_ent,
115 const size_t num_bytes_per_value)
// Process ids are unpacked only when the caller supplied room for them.
121 bool unpack_pids = pids_out.size() > 0;
// Byte ranges of the packed row, each one starting where the previous ends.
123 const size_t num_ent_beg = offset;
126 const size_t gids_beg = num_ent_beg + num_ent_len;
127 const size_t gids_len =
130 const size_t pids_beg = gids_beg + gids_len;
131 const size_t pids_len = unpack_pids ?
135 const size_t vals_beg = gids_beg + gids_len + pids_len;
136 const size_t vals_len = num_ent * num_bytes_per_value;
// Raw pointers to the start of each range; pids_in is null when process
// ids are not being unpacked.
138 const char*
const num_ent_in = imports.data () + num_ent_beg;
139 const char*
const gids_in = imports.data () + gids_beg;
140 const char*
const pids_in = unpack_pids ? imports.data () + pids_beg : nullptr;
141 const char*
const vals_in = imports.data () + vals_beg;
// Running total of bytes consumed by the individual unpack steps.
143 size_t num_bytes_out = 0;
// The entry count stored in the buffer must agree with the caller's num_ent.
146 if (static_cast<size_t> (num_ent_out) != num_ent) {
151 Kokkos::pair<int, size_t> p;
156 num_bytes_out += p.second;
163 num_bytes_out += p.second;
170 num_bytes_out += p.second;
// Final consistency check: bytes consumed must equal the layout's total size.
173 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
174 if (num_bytes_out != expected_num_bytes) {
// Functor state for the unpack-and-combine kernel: type aliases derived from
// the LocalMatrix / LocalMap template parameters, followed by the data
// members the kernel reads.
190 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
192 typedef LocalMatrix local_matrix_type;
// Scalar type comes from the matrix; ordinal and device types from the map.
195 typedef typename local_matrix_type::value_type ST;
196 typedef typename local_map_type::local_ordinal_type LO;
197 typedef typename local_map_type::global_ordinal_type GO;
198 typedef typename local_map_type::device_type DT;
199 typedef typename DT::execution_space XS;
// Input views: packet sizes and the packed buffer live on BufferDeviceType;
// offsets and import LIDs live on the map's device type DT.
201 typedef Kokkos::View<const size_t*, BufferDeviceType>
202 num_packets_per_lid_type;
203 typedef Kokkos::View<const size_t*, DT> offsets_type;
204 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
205 typedef Kokkos::View<const LO*, DT> import_lids_type;
// Scratch array types (all on DT), one per kind of unpacked row data.
207 typedef Kokkos::View<LO*, DT> lids_scratch_type;
208 typedef Kokkos::View<GO*, DT> gids_scratch_type;
209 typedef Kokkos::View<int*,DT> pids_scratch_type;
210 typedef Kokkos::View<ST*, DT> vals_scratch_type;
// Reduction result type; operator() stores (error code, loop index) in it.
212 typedef Kokkos::pair<int, LO> value_type;
// The map's local ordinal must match the matrix's ordinal type.
214 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
215 "LocalMap::local_ordinal_type and "
216 "LocalMatrix::ordinal_type must be the same.");
218 local_matrix_type local_matrix;
220 input_buffer_type imports;
221 num_packets_per_lid_type num_packets_per_lid;
222 import_lids_type import_lids;
223 offsets_type offsets;
227 size_t num_bytes_per_value;
// Token pool; operator() acquires a token and uses it to pick a private
// slice of each scratch array below.
229 Kokkos::Experimental::UniqueToken<XS, Kokkos::Experimental::UniqueTokenScope::Global> tokens;
230 lids_scratch_type lids_scratch;
231 gids_scratch_type gids_scratch;
232 pids_scratch_type pids_scratch;
233 vals_scratch_type vals_scratch;
// Constructor: stores the inputs in the corresponding members and allocates
// the scratch views.
236 const local_matrix_type& local_matrix_in,
238 const input_buffer_type& imports_in,
239 const num_packets_per_lid_type& num_packets_per_lid_in,
240 const import_lids_type& import_lids_in,
241 const offsets_type& offsets_in,
243 const size_t max_num_ent_in,
244 const bool unpack_pids_in,
245 const size_t num_bytes_per_value_in,
246 const bool atomic_in) :
247 local_matrix (local_matrix_in),
248 local_col_map (local_col_map_in),
249 imports (imports_in),
250 num_packets_per_lid (num_packets_per_lid_in),
251 import_lids (import_lids_in),
252 offsets (offsets_in),
253 combine_mode (combine_mode_in),
254 max_num_ent (max_num_ent_in),
255 unpack_pids (unpack_pids_in),
256 num_bytes_per_value (num_bytes_per_value_in),
// Each scratch view holds tokens.size() * max_num_ent entries, i.e. one
// max_num_ent-sized slice per token. The string is the Kokkos View label;
// it is kept equal to the member name so that profiling and debugging
// output identifies the right allocation.
259 lids_scratch (
"lids_scratch", tokens.size() * max_num_ent),
260 gids_scratch (
"gids_scratch", tokens.size() * max_num_ent),
261 pids_scratch (
"pids_scratch", tokens.size() * max_num_ent),
262 vals_scratch (
"vals_scratch", tokens.size() * max_num_ent)
// Reduction identity: error code 0 paired with the invalid local ordinal,
// i.e. "no error seen, no row recorded".
265 KOKKOS_INLINE_FUNCTION
void init(value_type& dst)
const
267 using Tpetra::Details::OrdinalTraits;
268 dst = Kokkos::make_pair (0, OrdinalTraits<LO>::invalid ());
// Reduction join of two (error code, index) results. `src` is considered
// only when it carries a valid index; it then takes precedence when `dst`
// has no valid index yet or when `src`'s index is smaller.
271 KOKKOS_INLINE_FUNCTION
void
272 join (
volatile value_type& dst,
const volatile value_type& src)
const
278 using Tpetra::Details::OrdinalTraits;
// An invalid index in src means src recorded nothing.
279 if (src.second != OrdinalTraits<LO>::invalid ()) {
// Prefer the smaller recorded index.
284 if (dst.second == OrdinalTraits<LO>::invalid () ||
285 src.second < dst.second) {
// Kernel body for import index `i`: validate the packed row, unpack it into
// a token-private slice of the scratch arrays, and combine it into row
// import_lids[i] of the local matrix. On failure, `dst` is set to
// (error code, i).
291 KOKKOS_INLINE_FUNCTION
292 void operator()(
const LO i, value_type& dst)
const
295 using Kokkos::subview;
296 using Kokkos::MemoryUnmanaged;
297 typedef typename XS::size_type size_type;
298 typedef typename Kokkos::pair<size_type, size_type> slice;
299 typedef BufferDeviceType BDT;
// Unmanaged view types used for the per-token scratch slices.
301 typedef View<LO*, DT, MemoryUnmanaged> lids_out_type;
302 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
303 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
304 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
306 const size_t num_bytes = num_packets_per_lid(i);
// A row with zero packed bytes is special-cased.
309 if (num_bytes == 0) {
314 const LO import_lid = import_lids[i];
315 const size_t buf_size = imports.size();
316 const size_t offset = offsets(i);
320 const char*
const in_buf = imports.data () + offset;
322 const size_t num_ent = static_cast<size_t> (num_ent_LO);
325 size_t expected_num_bytes = 0;
// Error code 1: the row needs more bytes than were received for it.
335 if (expected_num_bytes > num_bytes) {
336 dst = Kokkos::make_pair (1, i);
// Error code 2: the row's byte range falls outside the import buffer.
340 if (offset > buf_size || offset + num_bytes > buf_size) {
341 dst = Kokkos::make_pair (2, i);
// Acquire a token and carve out this token's slice [a, b) of each scratch
// array; slices are max_num_ent apart, and only num_ent entries are used.
348 const size_type token = tokens.acquire();
349 const size_t a = static_cast<size_t>(token) * max_num_ent;
350 const size_t b = a + num_ent;
351 lids_out_type lids_out = subview(lids_scratch, slice(a, b));
352 gids_out_type gids_out = subview(gids_scratch, slice(a, b));
// The pid slice is empty when pids are not being unpacked.
353 pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
354 vals_out_type vals_out = subview(vals_scratch, slice(a, b));
358 unpackRow<ST,LO,GO,DT,BDT>(gids_out, pids_out, vals_out,
359 imports, offset, num_bytes,
360 num_ent, num_bytes_per_value);
// A nonzero unpack_err is reported as the error code; the token is
// released on this path as well.
361 if (unpack_err != 0) {
362 dst = Kokkos::make_pair (unpack_err, i);
363 tokens.release (token);
370 for (
size_t k = 0; k < num_ent; ++k) {
375 const LO*
const lids_raw = const_cast<const LO*> (lids_out.data ());
376 const ST*
const vals_raw = const_cast<const ST*> (vals_out.data ());
// Combine the unpacked row into the matrix according to combine_mode.
378 if (combine_mode ==
ADD) {
380 local_matrix.sumIntoValues (import_lid, lids_raw, num_ent,
381 vals_raw,
false, atomic);
383 else if (combine_mode ==
REPLACE) {
385 local_matrix.replaceValues (import_lid, lids_raw, num_ent,
386 vals_raw,
false, atomic);
// Error code 4 -- presumably the branch taken for any other combine mode.
389 dst = Kokkos::make_pair (4, i);
390 tokens.release (token);
394 tokens.release (token);
// Empty tag types used to select between the two operator() overloads of the
// entry-counting functor (maximum per row vs. total).
398 struct MaxNumEntTag {};
399 struct TotNumEntTag {};
// Reduction functor over the packed rows. Tagged with MaxNumEntTag it
// reduces to the largest per-row entry count; tagged with TotNumEntTag it
// sums the entry counts.
409 template<
class LO,
class DT,
class BDT>
412 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
413 typedef Kokkos::View<const size_t*, DT> offsets_type;
414 typedef Kokkos::View<const char*, BDT> input_buffer_type;
// Both reductions produce a size_t.
417 typedef size_t value_type;
420 typedef Kokkos::pair<size_t,size_t> slice;
422 num_packets_per_lid_type num_packets_per_lid;
423 offsets_type offsets;
424 input_buffer_type imports;
428 const offsets_type& offsets_in,
429 const input_buffer_type& imports_in) :
430 num_packets_per_lid (num_packets_per_lid_in),
431 offsets (offsets_in),
// Max reduction: keep the larger of the running value and row i's count.
435 KOKKOS_INLINE_FUNCTION
void
436 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
438 const size_t num_bytes = num_packets_per_lid(i);
// Row i's packed data starts at offsets(i) within the import buffer.
441 const char*
const in_buf = imports.data () + offsets(i);
443 const size_t num_ent = static_cast<size_t> (num_ent_LO);
445 update = (update < num_ent) ? num_ent : update;
// Join for the max reduction: dst becomes max(dst, src).
449 KOKKOS_INLINE_FUNCTION
void
450 join (
const MaxNumEntTag,
451 volatile value_type& dst,
452 const volatile value_type& src)
const
454 if (dst < src) dst = src;
// Sum reduction: add row i's entry count to the running total.
457 KOKKOS_INLINE_FUNCTION
void
458 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
460 const size_t num_bytes = num_packets_per_lid(i);
463 const char*
const in_buf = imports.data () + offsets(i);
465 tot_num_ent += static_cast<size_t> (num_ent_LO);
// Runs a MaxNumEntTag-tagged parallel reduction over all rows to be unpacked
// (one per entry of num_packets_per_lid) and accumulates the result in
// max_num_ent.
477 template<
class LO,
class DT,
class BDT>
480 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
481 const Kokkos::View<const size_t*, DT>& offsets,
482 const Kokkos::View<const char*, BDT>& imports)
484 typedef typename DT::execution_space XS;
485 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
486 MaxNumEntTag> range_policy;
// The loop index type is LO, so the row count is narrowed to LO.
490 const LO numRowsToUnpack =
491 static_cast<LO> (num_packets_per_lid.extent (0));
492 size_t max_num_ent = 0;
493 Kokkos::parallel_reduce (
"Max num entries in CRS",
494 range_policy (0, numRowsToUnpack),
495 functor, max_num_ent);
// Runs a TotNumEntTag-tagged parallel reduction over all rows to be unpacked
// (one per entry of num_packets_per_lid) and accumulates the result in
// tot_num_ent.
506 template<
class LO,
class DT,
class BDT>
509 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
510 const Kokkos::View<const size_t*, DT>& offsets,
511 const Kokkos::View<const char*, BDT>& imports)
513 typedef typename DT::execution_space XS;
514 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
515 size_t tot_num_ent = 0;
// The loop index type is LO, so the row count is narrowed to LO.
518 const LO numRowsToUnpack =
519 static_cast<LO> (num_packets_per_lid.extent (0));
520 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
521 range_policy (0, numRowsToUnpack),
522 functor, tot_num_ent);
// Driver for unpacking packed rows and combining them into a local matrix.
// Validates the combine mode and input sizes, builds the per-row offsets,
// sizes the scratch space from the maximum row length, then runs the unpack
// functor as a parallel reduction.
// Throws std::invalid_argument for an unsupported combine mode or mismatched
// input sizes, and std::runtime_error when the functor reports an error.
533 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
536 const LocalMatrix& local_matrix,
538 const Kokkos::View<const char*, BufferDeviceType>& imports,
539 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
542 const bool unpack_pids,
545 typedef typename LocalMatrix::value_type ST;
546 typedef typename LocalMap::local_ordinal_type LO;
547 typedef typename LocalMap::device_type DT;
548 typedef typename DT::execution_space XS;
549 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
// Prefix prepended to every exception message raised below.
552 const char prefix[] =
553 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix: ";
// The case of no import LIDs is handled up front.
555 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
556 if (num_import_lids == 0) {
// Combine-mode validation: ABSMAX and INSERT are rejected explicitly, and
// anything other than ADD or REPLACE is rejected by the final check.
563 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
564 std::invalid_argument,
565 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
566 "static graph (i.e., was constructed with the CrsMatrix constructor "
567 "that takes a const CrsGraph pointer).");
569 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
570 std::invalid_argument,
571 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
572 "(i.e., was constructed with the CrsMatrix constructor that takes a "
573 "const CrsGraph pointer).");
576 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
577 std::invalid_argument,
578 prefix <<
"Invalid combine mode; should never get "
579 "here! Please report this bug to the Tpetra developers.");
// There must be exactly one packet-size entry per import LID.
582 bool bad_num_import_lids =
583 num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
584 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
585 std::invalid_argument,
586 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
587 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
// One offset per row plus one trailing entry.
591 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
// The largest row length determines the per-token scratch slice size.
597 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(
598 num_packets_per_lid, offsets, imports);
605 unpack_functor_type f(local_matrix, local_map,
606 imports, num_packets_per_lid, import_lids, offsets, combine_mode,
607 max_num_ent, unpack_pids, num_bytes_per_value, atomic);
// x receives the reduced (error code, row index) pair from the functor.
609 typename unpack_functor_type::value_type x;
610 Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
611 auto x_h = x.to_std_pair();
612 TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
613 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code "
614 << x_h.first <<
" for the first bad row " << x_h.second);
// Counts matrix entries coming from three sources: the first num_same_ids
// local rows, the rows named by permute_from_lids, and the rows packed in
// `imports`. Row lengths for the first two come from differences of
// consecutive local_matrix.graph.row_map entries.
619 template<
class LocalMatrix,
class BufferDeviceType>
622 const LocalMatrix& local_matrix,
624 const Kokkos::View<const char*, BufferDeviceType>& imports,
625 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
626 const size_t num_same_ids)
628 using Kokkos::parallel_reduce;
629 typedef typename LocalMatrix::ordinal_type LO;
630 typedef typename LocalMatrix::device_type device_type;
631 typedef typename device_type::execution_space XS;
632 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
633 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
634 typedef BufferDeviceType BDT;
// Part 1: sum the lengths of rows [0, num_same_ids).
640 num_items = static_cast<LO>(num_same_ids);
643 parallel_reduce(range_policy(0, num_items),
644 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
645 update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
646 -local_matrix.graph.row_map[lid]);
// Part 2: sum the lengths of the rows listed in permute_from_lids.
652 num_items = static_cast<LO>(permute_from_lids.extent(0));
655 parallel_reduce(range_policy(0, num_items),
656 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
657 const LO lid = permute_from_lids(i);
658 update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
659 - local_matrix.graph.row_map[lid]);
// Part 3: count the entries in the packed buffer, using per-row offsets
// (one per packet plus one trailing entry).
666 const size_type np = num_packets_per_lid.extent(0);
667 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
670 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
// Reads the entry count of the packed row that starts at `offset` in
// `imports`. Returns OrdinalTraits<size_t>::invalid() when the bytes needed
// (p_num_bytes) exceed the num_bytes available for the row; otherwise
// returns the count converted to size_t.
677 template<
class LO,
class DT,
class BDT>
678 KOKKOS_INLINE_FUNCTION
680 unpackRowCount(
const Kokkos::View<const char*, BDT>& imports,
682 const size_t num_bytes)
// Not enough bytes for the count itself: signal with the invalid value.
687 if (p_num_bytes > num_bytes) {
688 return OrdinalTraits<size_t>::invalid();
690 const char*
const in_buf = imports.data () + offset;
693 return static_cast<size_t>(num_ent_LO);
// For every packed row, reads its entry count and atomically adds it to
// tgt_rowptr at the row's import LID. Runs as a parallel reduction whose
// int result (k_error) carries an error indication.
697 template<
class LO,
class DT,
class BDT>
699 setupRowPointersForRemotes(
702 const Kokkos::View<const char*, BDT>& imports,
703 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
706 using Kokkos::parallel_reduce;
707 typedef typename DT::execution_space XS;
709 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// Sentinel that unpackRowCount returns on failure.
711 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
712 const size_type N = num_packets_per_lid.extent(0);
715 parallel_reduce (
"Setup row pointers for remotes",
717 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
// Element type of tgt_rowptr, used so the atomic add's operand type matches.
718 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
719 const size_t num_bytes = num_packets_per_lid(i);
720 const size_t offset = offsets(i);
721 const size_t num_ent = unpackRowCount<LO, DT, BDT> (imports, offset, num_bytes);
722 if (num_ent == InvalidNum) {
// The add is atomic, so concurrent updates to the same tgt_rowptr entry
// are not lost.
725 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
// Runs a parallel scan over tgt_rowptr. Each entry is overwritten with the
// scan's running value `update`, and the same value is copied into
// new_start_row. NOTE(review): presumably this turns per-row lengths into
// row start offsets, as the name suggests -- confirm against the
// accumulation step.
733 makeCrsRowPtrFromLengths(
735 const Kokkos::View<size_t*,DT>& new_start_row)
737 using Kokkos::parallel_scan;
738 typedef typename DT::execution_space XS;
739 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
740 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
741 const size_type N = new_start_row.extent(0);
742 parallel_scan(range_policy(0, N),
743 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
// Capture the current entry before it is overwritten below.
744 auto cur_val = tgt_rowptr(i);
746 tgt_rowptr(i) = update;
747 new_start_row(i) = tgt_rowptr(i);
// Copies the first num_same_ids rows of the source matrix into the target
// CRS arrays; source and target local row indices are identical. Column
// indices are converted to global ids through local_col_map, and a pid equal
// to my_pid is stored as -1.
754 template<
class LocalMatrix,
class LocalMap>
760 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
763 const LocalMatrix& local_matrix,
765 const size_t num_same_ids,
768 using Kokkos::parallel_for;
769 typedef typename LocalMap::device_type DT;
770 typedef typename LocalMap::local_ordinal_type LO;
771 typedef typename DT::execution_space XS;
772 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
774 parallel_for(range_policy(0, num_same_ids),
775 KOKKOS_LAMBDA(
const size_t i) {
// Element type of new_start_row, used as the atomic add's operand type.
776 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
778 const LO src_lid = static_cast<LO>(i);
779 size_t src_row = local_matrix.graph.row_map(src_lid);
781 const LO tgt_lid = static_cast<LO>(i);
782 const size_t tgt_row = tgt_rowptr(tgt_lid);
// nsr is the source row's length; new_start_row for the target row is
// advanced by it.
784 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
785 - local_matrix.graph.row_map(src_lid);
786 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source row lands at tgt_row + (j - src_row) in the target.
788 for (
size_t j=local_matrix.graph.row_map(src_lid);
789 j<local_matrix.graph.row_map(src_lid+1); ++j) {
790 LO src_col = local_matrix.graph.entries(j);
791 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
792 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
793 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copies the source rows listed in permute_from_lids into the target rows
// listed in permute_to_lids (paired by position). Column indices are
// converted to global ids through local_col_map, and a pid equal to my_pid
// is stored as -1.
799 template<
class LocalMatrix,
class LocalMap>
801 copyDataFromPermuteIDs(
805 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
810 const LocalMatrix& local_matrix,
814 using Kokkos::parallel_for;
815 typedef typename LocalMap::device_type DT;
816 typedef typename LocalMap::local_ordinal_type LO;
817 typedef typename DT::execution_space XS;
818 typedef typename PackTraits<LO,DT>::input_array_type::size_type size_type;
819 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
821 const size_type num_permute_to_lids = permute_to_lids.extent(0);
823 parallel_for(range_policy(0, num_permute_to_lids),
824 KOKKOS_LAMBDA(
const size_t i) {
// Element type of new_start_row, used as the atomic add's operand type.
825 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
827 const LO src_lid = permute_from_lids(i);
828 const size_t src_row = local_matrix.graph.row_map(src_lid);
830 const LO tgt_lid = permute_to_lids(i);
831 const size_t tgt_row = tgt_rowptr(tgt_lid);
// nsr is the source row's length; new_start_row for the target row is
// advanced by it.
833 size_t nsr = local_matrix.graph.row_map(src_lid+1)
834 - local_matrix.graph.row_map(src_lid);
835 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source row lands at tgt_row + (j - src_row) in the target.
837 for (
size_t j=local_matrix.graph.row_map(src_lid);
838 j<local_matrix.graph.row_map(src_lid+1); ++j) {
839 LO src_col = local_matrix.graph.entries(j);
840 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
841 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
842 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpacks every packed row in `imports` directly into the target CRS arrays.
// For each row it reserves a range in the target row via an atomic add on
// new_start_row, unpacks into subviews covering that range, and then
// rewrites pids equal to my_pid as -1. Errors from unpackRow are accumulated
// in the reduction value k_error.
848 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
850 unpackAndCombineIntoCrsArrays2(
854 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
857 const Kokkos::View<const char*, BufferDeviceType>& imports,
858 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
859 const LocalMatrix& local_matrix,
862 const size_t num_bytes_per_value)
865 using Kokkos::subview;
866 using Kokkos::MemoryUnmanaged;
867 using Kokkos::parallel_reduce;
868 using Kokkos::atomic_fetch_add;
870 typedef typename LocalMap::device_type DT;
871 typedef typename LocalMap::local_ordinal_type LO;
872 typedef typename LocalMap::global_ordinal_type GO;
873 typedef typename LocalMatrix::value_type ST;
874 typedef typename DT::execution_space XS;
875 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
876 typedef typename Kokkos::pair<size_type, size_type> slice;
877 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
878 typedef BufferDeviceType BDT;
// Unmanaged view types for the per-row output subviews.
880 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
881 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
882 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
// Sentinel that unpackRowCount returns on failure.
884 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
887 const size_type num_import_lids = import_lids.size();
890 parallel_reduce (
"Unpack and combine into CRS",
891 range_policy (0, num_import_lids),
892 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
893 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
894 const size_t num_bytes = num_packets_per_lid(i);
895 const size_t offset = offsets(i);
// A row with zero packed bytes is special-cased.
896 if (num_bytes == 0) {
900 size_t num_ent = unpackRowCount<LO,DT,BDT>(imports, offset, num_bytes);
901 if (num_ent == InvalidNum) {
// atomic_fetch_add returns the value before the add, which is the first
// free slot of the target row; this call claims [start_row, end_row).
905 const LO lcl_row = import_lids(i);
906 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
907 const size_t end_row = start_row + num_ent;
909 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
910 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
911 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
913 k_error += unpackRow<ST,LO,GO,DT,BDT>(gids_out, pids_out, vals_out,
914 imports, offset, num_bytes,
915 num_ent, num_bytes_per_value);
// Entries owned by this process are marked with pid -1.
918 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
919 const int pid = pids_out(j);
920 pids_out(j) = (pid != my_pid) ? pid : -1;
// Device-side driver that assembles the target CRS arrays (row pointers,
// column indices, values, pids) from three sources: rows shared with the
// source matrix, permuted rows, and rows packed in `imports`.
// Steps, in order: fill tgt_rowptr with row lengths, convert lengths with
// makeCrsRowPtrFromLengths, copy same-ID and permuted rows, then unpack the
// remote rows.
// Throws std::logic_error or std::invalid_argument when a consistency check
// fails.
927 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
930 const LocalMatrix & local_matrix,
933 const Kokkos::View<const char*, BufferDeviceType>& imports,
934 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
942 const size_t num_same_ids,
943 const size_t tgt_num_rows,
944 const size_t tgt_num_nonzeros,
945 const int my_tgt_pid,
946 const size_t num_bytes_per_value)
949 using Kokkos::subview;
950 using Kokkos::parallel_for;
951 using Kokkos::MemoryUnmanaged;
953 typedef typename LocalMap::device_type DT;
954 typedef typename LocalMap::local_ordinal_type LO;
955 typedef typename DT::execution_space XS;
956 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
957 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
958 typedef BufferDeviceType BDT;
// Prefix prepended to every exception message raised below.
960 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
962 const size_t N = tgt_num_rows;
963 const size_t mynnz = tgt_num_nonzeros;
967 const int my_pid = my_tgt_pid;
// First pass over all N+1 row-pointer slots.
970 parallel_for(range_policy(0, N+1),
971 KOKKOS_LAMBDA(
const size_t i) {
// Row lengths for the same-ID rows (source and target index coincide).
977 parallel_for(range_policy(0, num_same_ids),
978 KOKKOS_LAMBDA(
const size_t i) {
979 const LO tgt_lid = static_cast<LO>(i);
980 const LO src_lid = static_cast<LO>(i);
981 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
982 - local_matrix.graph.row_map(src_lid);
// Row lengths for the permuted rows.
987 const size_type num_permute_to_lids = permute_to_lids.extent(0);
988 parallel_for(range_policy(0, num_permute_to_lids),
989 KOKKOS_LAMBDA(
const size_t i) {
990 const LO tgt_lid = permute_to_lids(i);
991 const LO src_lid = permute_from_lids(i);
992 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
993 - local_matrix.graph.row_map(src_lid);
// Byte offsets of the packed rows: one per import LID plus a trailing entry.
998 const size_type num_import_lids = import_lids.extent(0);
999 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
// Debug-only check: the last offset must equal the import buffer size.
1002 #ifdef HAVE_TPETRA_DEBUG
1004 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1005 const bool condition =
1006 nth_offset_h != static_cast<size_t>(imports.extent (0));
1007 TEUCHOS_TEST_FOR_EXCEPTION
1008 (condition, std::logic_error, prefix
1009 <<
"The final offset in bytes " << nth_offset_h
1010 <<
" != imports.size() = " << imports.extent(0)
1011 <<
". Please report this bug to the Tpetra developers.");
1013 #endif // HAVE_TPETRA_DEBUG
// Add the remote rows' lengths to tgt_rowptr.
1017 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1018 import_lids, imports, num_packets_per_lid, offsets);
1019 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1020 <<
" Error transferring data to target row pointers. "
1021 "Please report this bug to the Tpetra developers.");
// new_start_row tracks the next free slot of each target row while the
// copy / unpack steps below fill the rows.
1025 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1028 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// After the conversion, the last row pointer must equal the expected
// number of nonzeros.
1030 auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
1031 bool condition = nth_tgt_rowptr_h != mynnz;
1032 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
1033 prefix <<
"CRS_rowptr[last] = " <<
1034 nth_tgt_rowptr_h <<
"!= mynnz = " << mynnz <<
".");
1038 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1039 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1041 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1042 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1043 local_matrix, local_col_map, my_pid);
// An empty import buffer is special-cased before the unpack step.
1045 if (imports.extent(0) <= 0) {
1049 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1050 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1051 local_matrix, local_col_map, my_pid, num_bytes_per_value);
1052 TEUCHOS_TEST_FOR_EXCEPTION(
1053 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1054 "should never happen. Please report this bug to the Tpetra developers.");
// Host-array (Teuchos::ArrayView) entry point. Builds device-side views
// named "num_packets_per_lid", "import_lids" and "imports" from the input
// arrays, obtains the local column map from sourceMatrix, and forwards to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
1100 template<
typename ST,
typename LO,
typename GO,
typename Node>
1104 const Teuchos::ArrayView<const char>& imports,
1105 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1106 const Teuchos::ArrayView<const LO>& importLIDs,
1107 size_t constantNumPackets,
1113 typedef typename Node::device_type device_type;
// The Node's device type must match the local matrix's device type.
1115 static_assert (std::is_same<device_type, typename local_matrix_type::device_type>::value,
1116 "Node::device_type and LocalMatrix::device_type must be the same.");
1119 typedef typename device_type::execution_space XS;
1122 typename XS::device_type outputDevice;
1127 auto num_packets_per_lid_d =
1129 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1131 auto import_lids_d =
1133 importLIDs.size(),
true,
"import_lids");
1137 imports.size(),
true,
"imports");
1140 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
// NOTE(review): the literal `false` is presumably the callee's unpack_pids
// argument -- confirm against the callee's parameter order.
1143 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1144 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1145 import_lids_d, combineMode,
false, atomic);
// Kokkos::DualView entry point. Syncs each input to the memory space the
// kernel reads from (buffer memory space for packet sizes and the import
// buffer, device memory space for the import LIDs), takes views in those
// spaces, and forwards to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
1150 template<
typename ST,
typename LO,
typename GO,
typename NT>
1155 const Kokkos::DualView<const LO*, typename NT::device_type>& importLIDs,
1156 const size_t constantNumPackets,
1163 typedef typename NT::device_type device_type;
1165 typedef typename crs_matrix_type::local_matrix_type local_matrix_type;
1167 typedef typename dist_object_type::buffer_device_type buffer_device_type;
// BMS: memory space of the communication buffers; MS: memory space of the
// matrix's device.
1168 typedef typename buffer_device_type::memory_space BMS;
1169 typedef typename device_type::memory_space MS;
1171 static_assert (std::is_same<device_type,
1172 typename local_matrix_type::device_type>::value,
1173 "NT::device_type and LocalMatrix::device_type must be "
// Sync goes through the *_nc objects; the views are then taken from the
// original arguments.
1178 numPacketsPerLID_nc.template sync<BMS> ();
1180 auto num_packets_per_lid_d = numPacketsPerLID.template view<BMS> ();
1184 importLIDs_nc.template sync<MS> ();
1186 auto import_lids_d = importLIDs.template view<MS> ();
1190 imports_nc.template sync<BMS> ();
1192 auto imports_d = imports.template view<BMS> ();
1195 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1196 typedef decltype (local_col_map) local_map_type;
// NOTE(review): the literal `false` is presumably the callee's unpack_pids
// argument -- confirm against the callee's parameter order.
1199 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1203 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1204 import_lids_d, combineMode,
false, atomic);
// Host-array entry point for the entry count. Validates the argument sizes
// and that the matrix is locally indexed, builds device-side views of the
// host arrays, and returns the result of
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount.
// Throws std::invalid_argument when a precondition fails.
1262 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1266 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1267 const Teuchos::ArrayView<const char> &imports,
1268 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1269 size_t constantNumPackets,
1273 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1274 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1276 using Kokkos::MemoryUnmanaged;
1278 typedef typename Node::device_type DT;
// Prefix prepended to every exception message raised below.
1280 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
// The permute-to and permute-from lists are paired, so their sizes must match.
1282 TEUCHOS_TEST_FOR_EXCEPTION
1283 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1284 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != "
1285 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1289 TEUCHOS_TEST_FOR_EXCEPTION
1290 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1291 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet-size entry per import LID.
1292 TEUCHOS_TEST_FOR_EXCEPTION
1293 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1294 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != "
1295 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
// Device-side views of the host input arrays.
1298 auto permute_from_lids_d =
1300 permuteFromLIDs.getRawPtr (),
1301 permuteFromLIDs.size (),
true,
1302 "permute_from_lids");
1305 imports.getRawPtr (),
1306 imports.size (),
true,
1308 auto num_packets_per_lid_d =
1310 numPacketsPerLID.getRawPtr (),
1311 numPacketsPerLID.size (),
true,
1312 "num_packets_per_lid");
1314 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1315 local_matrix, permute_from_lids_d, imports_d,
1316 num_packets_per_lid_d, numSameIDs);
// Host-array entry point for assembling the target CRS arrays. Validates
// argument sizes, sizes TargetPids to TargetNumNonzeros and fills it with
// -1, builds device-side views of all host arrays, determines the packed
// size of one value (max-reduced over the matrix's communicator), and
// forwards to UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays.
// Afterwards, host views are constructed over the caller's output arrays.
// Throws std::invalid_argument when a size precondition fails.
1333 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1337 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1338 const Teuchos::ArrayView<const char>& imports,
1339 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1340 const size_t constantNumPackets,
1343 const size_t numSameIDs,
1344 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1345 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1346 size_t TargetNumRows,
1347 size_t TargetNumNonzeros,
1348 const int MyTargetPID,
1349 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1350 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1352 const Teuchos::ArrayView<const int>& SourcePids,
1353 Teuchos::Array<int>& TargetPids)
1360 using Teuchos::ArrayView;
1361 using Teuchos::outArg;
1362 using Teuchos::REDUCE_MAX;
1363 using Teuchos::reduceAll;
1365 typedef LocalOrdinal LO;
1367 typedef typename Node::device_type DT;
1368 typedef typename DT::execution_space XS;
1371 typedef typename matrix_type::impl_scalar_type ST;
1372 typedef typename ArrayView<const LO>::size_type size_type;
// Prefix prepended to every exception message raised below.
1374 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
// The row-pointer array needs one slot per target row plus one.
1376 TEUCHOS_TEST_FOR_EXCEPTION(
1377 TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1378 std::invalid_argument, prefix <<
"CRS_rowptr.size() = " <<
1379 CRS_rowptr.size () <<
"!= TargetNumRows+1 = " << TargetNumRows+1 <<
".");
1381 TEUCHOS_TEST_FOR_EXCEPTION(
1382 permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1383 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size ()
1384 <<
"!= permuteFromLIDs.size() = " << permuteFromLIDs.size () <<
".");
1385 const size_type numImportLIDs = importLIDs.size ();
1387 TEUCHOS_TEST_FOR_EXCEPTION(
1388 numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1389 prefix <<
"importLIDs.size() = " << numImportLIDs <<
" != "
1390 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
// TargetPids is sized to one entry per target nonzero and filled with -1.
1393 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1394 TargetPids.resize (TargetNumNonzeros);
1396 TargetPids.assign (TargetNumNonzeros, -1);
1400 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
// Device-side views of the host input and output arrays follow.
1403 typename XS::device_type outputDevice;
1404 auto import_lids_d =
1406 importLIDs.size(),
true,
"import_lids");
1410 imports.size(),
true,
"imports");
1412 auto num_packets_per_lid_d =
1414 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1416 auto permute_from_lids_d =
1418 permuteFromLIDs.size(),
true,
"permute_from_lids");
1420 auto permute_to_lids_d =
1422 permuteToLIDs.size(),
true,
"permute_to_lids");
1426 CRS_rowptr.size(),
true,
"crs_rowptr");
1430 CRS_colind.size(),
true,
"crs_colidx");
// Compile-time guard: the value type stored in Kokkos::View objects must
// not be std::complex<double>.
1432 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1433 static_assert (! std::is_same<
1434 typename std::remove_const<
1435 typename std::decay<
1439 std::complex<double> >::value,
1440 "CRS_vals::value_type is std::complex<double>; this should never happen"
1441 ", since std::complex does not work in Kokkos::View objects.");
1442 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1446 CRS_vals.size(),
true,
"crs_vals");
1448 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1449 static_assert (! std::is_same<
1450 typename decltype (crs_vals_d)::non_const_value_type,
1451 std::complex<double> >::value,
1452 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1453 "never happen, since std::complex does not work in Kokkos::View objects.");
1454 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1458 SourcePids.size(),
true,
"src_pids");
1462 TargetPids.size(),
true,
"tgt_pids");
// The local per-value byte size is taken from a sample value (from the
// local matrix when it has values, otherwise from crs_vals_d); the value
// actually used is the maximum over all processes of the communicator.
1464 size_t num_bytes_per_value = 0;
1478 size_t num_bytes_per_value_l = 0;
1479 if (local_matrix.values.extent(0) > 0) {
1480 const ST& val = local_matrix.values(0);
1483 const ST& val = crs_vals_d(0);
1486 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1487 Teuchos::REDUCE_MAX,
1488 num_bytes_per_value_l,
1489 outArg(num_bytes_per_value));
1492 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1493 static_assert (! std::is_same<
1494 typename decltype (crs_vals_d)::non_const_value_type,
1495 std::complex<double> >::value,
1496 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1497 "never happen, since std::complex does not work in Kokkos::View objects.");
1498 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1500 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1501 local_matrix, local_col_map, import_lids_d, imports_d,
1502 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1503 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1504 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1505 num_bytes_per_value);
// Host views constructed directly over the caller's output arrays --
// presumably the destinations for copying the device results back.
1508 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1509 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1512 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1513 CRS_colind.getRawPtr(), CRS_colind.size());
1516 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1517 CRS_vals.getRawPtr(), CRS_vals.size());
1520 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1521 TargetPids.getRawPtr(), TargetPids.size());
// Macro that names, for one (ST, LO, GO, NT) combination, the four public
// function templates of this file: unpackCrsMatrixAndCombine,
// unpackCrsMatrixAndCombineNew, unpackAndCombineIntoCrsArrays and
// unpackAndCombineWithOwningPIDsCount.
// Every line of the macro body except the last ends in a backslash
// continuation, so no comment lines may be inserted inside it.
1529 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1531 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1532 const CrsMatrix<ST, LO, GO, NT>&, \
1533 const Teuchos::ArrayView<const char>&, \
1534 const Teuchos::ArrayView<const size_t>&, \
1535 const Teuchos::ArrayView<const LO>&, \
1541 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1542 const CrsMatrix<ST, LO, GO, NT>&, \
1543 const Kokkos::DualView<const char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1544 const Kokkos::DualView<const size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1545 const Kokkos::DualView<const LO*, NT::device_type>&, \
1548 const CombineMode, \
1551 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1552 const CrsMatrix<ST, LO, GO, NT> &, \
1553 const Teuchos::ArrayView<const LO>&, \
1554 const Teuchos::ArrayView<const char>&, \
1555 const Teuchos::ArrayView<const size_t>&, \
1558 const CombineMode, \
1560 const Teuchos::ArrayView<const LO>&, \
1561 const Teuchos::ArrayView<const LO>&, \
1565 const Teuchos::ArrayView<size_t>&, \
1566 const Teuchos::ArrayView<GO>&, \
1567 const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1568 const Teuchos::ArrayView<const int>&, \
1569 Teuchos::Array<int>&); \
1571 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1572 const CrsMatrix<ST, LO, GO, NT> &, \
1573 const Teuchos::ArrayView<const LO> &, \
1574 const Teuchos::ArrayView<const char> &, \
1575 const Teuchos::ArrayView<const size_t>&, \
1580 const Teuchos::ArrayView<const LO>&, \
1581 const Teuchos::ArrayView<const LO>&);
1583 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
1583 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP