42 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
55 #include "Kokkos_Core.hpp"
80 #ifndef DOXYGEN_SHOULD_SKIP_THIS
83 #endif // DOXYGEN_SHOULD_SKIP_THIS
90 namespace UnpackAndCombineCrsGraphImpl {
// unpackRow: copy one packed row out of the import buffer.
// The first num_ent packets starting at `offset` are the row's global
// indices; when pids_out is non-empty, the next num_ent packets are the
// matching process IDs, narrowed to int.
101 template<
class Packet,
class GO,
class Device,
class BufferDevice>
103 unpackRow(
typename Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
104 typename Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
105 const Kokkos::View<const Packet*,BufferDevice>& imports,
107 const size_t num_ent)
109 typedef typename Kokkos::View<GO*,Device>::size_type size_type;
// Global indices occupy imports[offset, offset + num_ent).
117 for (size_type k=0; k<num_ent; k++)
118 gids_out(k) = imports(offset+k);
// A zero-length pids_out means the caller did not ask for process IDs.
121 if (pids_out.size() > 0) {
// Process IDs follow the indices: imports[offset + num_ent, offset + 2*num_ent).
122 for (size_type k=0; k<num_ent; k++)
123 pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
// Functor that unpacks the packets received for each import LID. It is
// executed through a parallel_reduce whose result is a (code, LID) pair.
139 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
142 typedef Packet packet_type;
144 typedef LocalGraph local_graph_type;
145 typedef BufferDevice buffer_device_type;
147 typedef typename local_map_type::local_ordinal_type LO;
148 typedef typename local_map_type::global_ordinal_type GO;
151 typedef typename local_map_type::device_type device_type;
152 typedef typename device_type::execution_space execution_space;
// The packet buffer and the per-LID packet counts use the buffer device;
// offsets, import LIDs and the scratch arrays use the map's device.
154 typedef Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_type;
155 typedef Kokkos::View<const size_t*, device_type> offsets_type;
156 typedef Kokkos::View<const packet_type*, buffer_device_type> input_buffer_type;
157 typedef Kokkos::View<const LO*, device_type> import_lids_type;
159 typedef Kokkos::View<LO*, device_type> lids_scratch_type;
160 typedef Kokkos::View<GO*, device_type> gids_scratch_type;
161 typedef Kokkos::View<int*,device_type> pids_scratch_type;
// The graph's entry type must be the map's local ordinal type.
163 static_assert(std::is_same<LO, typename local_graph_type::data_type>::value,
164 "LocalMap::local_ordinal_type and "
165 "LocalGraph::data_type must be the same.");
167 local_graph_type local_graph;
169 input_buffer_type imports;
170 num_packets_per_lid_type num_packets_per_lid;
171 import_lids_type import_lids;
172 offsets_type offsets;
// Tokens hand each concurrent call of operator() an index that selects
// its own window of the scratch views below.
177 Kokkos::Experimental::UniqueToken<execution_space,
178 Kokkos::Experimental::UniqueTokenScope::Global> tokens;
179 lids_scratch_type lids_scratch;
180 gids_scratch_type gids_scratch;
181 pids_scratch_type pids_scratch;
// Reduction value: first = error code (0 in init()), second = the LID
// index at which operator() recorded that code.
184 typedef Kokkos::pair<int, LO> value_type;
// Constructor: stores the inputs and allocates the scratch views.
// Each scratch view holds tokens.size() * max_num_ent entries, so that
// operator() can give every token its own window of max_num_ent slots.
// Each view is labeled with the name of the member it backs, so that the
// label Kokkos attaches to the allocation identifies the right array.
187 const local_graph_type& local_graph_in,
189 const input_buffer_type& imports_in,
190 const num_packets_per_lid_type& num_packets_per_lid_in,
191 const import_lids_type& import_lids_in,
192 const offsets_type& offsets_in,
194 const size_t max_num_ent_in,
195 const bool unpack_pids_in,
196 const bool atomic_in) :
197 local_graph(local_graph_in),
198 local_col_map(local_col_map_in),
200 num_packets_per_lid(num_packets_per_lid_in),
201 import_lids(import_lids_in),
203 combine_mode(combine_mode_in),
204 max_num_ent(max_num_ent_in),
205 unpack_pids(unpack_pids_in),
207 tokens(execution_space()),
208 lids_scratch(
"lids_scratch", tokens.size() * max_num_ent),
209 gids_scratch(
"gids_scratch", tokens.size() * max_num_ent),
210 pids_scratch(
"pids_scratch", tokens.size() * max_num_ent)
// Reduction identity: error code 0 paired with an invalid local ordinal,
// i.e. no failing row has been recorded.
213 KOKKOS_INLINE_FUNCTION
void init(value_type& dst)
const
215 using Tpetra::Details::OrdinalTraits;
216 dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
// Reduction join of two partial results. A src whose second member is the
// invalid ordinal carries no error and is ignored. Otherwise src is only
// considered when dst has no row recorded yet or src's row index is
// smaller, so the smallest recorded row index is preferred.
219 KOKKOS_INLINE_FUNCTION
void
220 join(
volatile value_type& dst,
const volatile value_type& src)
const
226 using Tpetra::Details::OrdinalTraits;
227 if (src.second != OrdinalTraits<LO>::invalid()) {
232 if (dst.second == OrdinalTraits<LO>::invalid() ||
233 src.second < dst.second) {
// Unpack the packets belonging to the i-th import LID.
// Error codes written to dst.first (with dst.second = i):
//   1 - PIDs are expected but the packet count is odd,
//   2 - the row's packets would run past the end of the import buffer,
//   3 - recorded next to the unpackRow call (presumably when it reports a
//       nonzero status - TODO confirm).
239 KOKKOS_INLINE_FUNCTION
240 void operator()(
const LO i, value_type& dst)
const
243 using Kokkos::subview;
244 using Kokkos::MemoryUnmanaged;
245 typedef typename execution_space::size_type size_type;
246 typedef typename Kokkos::pair<size_type, size_type> slice;
248 typedef View<LO*, device_type, MemoryUnmanaged> lids_out_type;
249 typedef View<int*,device_type, MemoryUnmanaged> pids_out_type;
250 typedef View<GO*, device_type, MemoryUnmanaged> gids_out_type;
// With PIDs packed, each entry takes two packets (index + PID).
252 const size_t num_packets_this_lid = num_packets_per_lid(i);
253 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
254 : num_packets_this_lid;
255 if (unpack_pids && num_packets_this_lid%2 != 0) {
258 dst = Kokkos::make_pair(1, i);
// Bounds check of [offset, offset + num_packets_this_lid) against the buffer.
268 const size_t buf_size = imports.size();
269 const size_t offset = offsets(i);
271 if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
272 dst = Kokkos::make_pair(2, i);
// The token selects this call's window [a, b) inside each scratch view;
// the window starts at token * max_num_ent and is num_ent long.
279 const size_type token = tokens.acquire();
280 const size_t a = static_cast<size_t>(token) * max_num_ent;
281 const size_t b = a + num_ent;
282 lids_out_type lids_out = subview(lids_scratch, slice(a, b));
283 gids_out_type gids_out = subview(gids_scratch, slice(a, b));
// Without PIDs the slice is empty, which tells unpackRow to skip them.
284 pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
287 int err = unpackRow<packet_type,GO,device_type,buffer_device_type>(
288 gids_out, pids_out, imports, offset, num_ent);
291 dst = Kokkos::make_pair(3, i);
298 for (
size_t k = 0; k < num_ent; ++k) {
// Give the scratch window back once this row is finished.
302 tokens.release(token);
// Driver for UnpackAndCombineFunctor: validates the inputs, computes the
// largest row size, runs the functor over all import LIDs and turns a
// nonzero error code into an exception.
// NOTE: the first statement throws unconditionally (its condition is the
// literal `true`), so the code after it does not run; the message
// explains that the method is incomplete.
313 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
316 const LocalGraph& local_graph,
318 const Kokkos::View<const Packet*, BufferDevice, Kokkos::MemoryUnmanaged>& imports,
319 const Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>& num_packets_per_lid,
320 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
321 typename LocalMap::device_type,
322 Kokkos::MemoryUnmanaged>& import_lids,
324 const bool unpack_pids,
328 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
329 "unpackAndCombine[New] should not (yet) be called, the method is "
330 "incomplete. To complete, indices need to be inserted (unpacked) in to "
331 "the destination graph. The local graph, a Kokkos::StaticCrsGraph, does "
332 "not support insertion of indices");
334 typedef typename LocalMap::local_ordinal_type LO;
335 typedef typename LocalMap::device_type device_type;
336 typedef typename device_type::execution_space execution_space;
337 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO> > range_policy;
340 const char prefix[] =
341 "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
// Special case: no import LIDs.
343 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
344 if (num_import_lids == 0) {
// Combine-mode validation: INSERT is rejected and anything other than
// REPLACE is reported as an internal error.
350 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
351 std::invalid_argument,
352 prefix <<
"INSERT combine mode is not allowed if the graph has a static graph "
353 "(i.e., was constructed with the CrsGraph constructor that takes a "
354 "const CrsGraph pointer).");
357 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode !=
REPLACE,
358 std::invalid_argument,
359 prefix <<
"Invalid combine mode; should never get "
360 "here! Please report this bug to the Tpetra developers.");
// There must be exactly one packet count per import LID.
363 bool bad_num_import_lids =
364 num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
365 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
366 std::invalid_argument,
367 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
368 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
// One offset per import LID plus one extra trailing slot.
372 Kokkos::View<size_t*, device_type> offsets(
"offsets", num_import_lids+1);
// Max-reduction over the rows: the largest number of entries in any
// incoming row (packets are halved when PIDs are packed). The result is
// passed to the functor as max_num_ent.
379 Kokkos::parallel_reduce(
"MaxReduce",
380 num_packets_per_lid.size(),
381 KOKKOS_LAMBDA(
const int& i,
size_t& running_max_num_ent) {
382 size_t num_packets_this_lid = num_packets_per_lid(i);
383 size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
384 : num_packets_this_lid;
385 if (num_ent > running_max_num_ent) running_max_num_ent = num_ent;
386 }, Kokkos::Max<size_t>(max_num_ent));
389 unpack_functor_type f(local_graph, local_map,
390 imports, num_packets_per_lid, import_lids, offsets, combine_mode,
391 max_num_ent, unpack_pids, atomic);
// Run the functor over every import LID; x receives (error code, row).
393 typename unpack_functor_type::value_type x;
394 Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
395 auto x_h = x.to_std_pair();
396 TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
397 prefix <<
"UnpackAndCombineFunctor reported error code "
398 << x_h.first <<
" for the first bad row " << x_h.second);
// Counts graph entries from three sources: the first num_same_ids rows of
// local_graph, the rows listed in permute_from_lids, and the incoming
// packets, where two packets are counted as one entry.
403 template<
class Packet,
class LocalGraph,
class BufferDevice>
406 const LocalGraph& local_graph,
407 const Kokkos::View<
const typename LocalGraph::data_type*,
408 typename LocalGraph::device_type,
409 Kokkos::MemoryUnmanaged> permute_from_lids,
410 const Kokkos::View<const Packet*, BufferDevice>& imports,
411 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
412 const size_t num_same_ids)
414 using Kokkos::parallel_reduce;
415 typedef LocalGraph local_graph_type;
416 typedef typename local_graph_type::data_type LO;
417 typedef typename local_graph_type::device_type device_type;
418 typedef typename device_type::execution_space execution_space;
419 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO> > range_policy;
// Same IDs: sum the lengths of rows [0, num_same_ids).
425 num_items = static_cast<LO>(num_same_ids);
429 range_policy(0, num_items),
430 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
431 update += static_cast<size_t>(local_graph.row_map[lid+1]
432 -local_graph.row_map[lid]);
// Permuted IDs: sum the lengths of the source rows being permuted.
438 num_items = static_cast<LO>(permute_from_lids.extent(0));
442 range_policy(0, num_items),
443 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
444 const LO lid = permute_from_lids(i);
445 update += static_cast<size_t>(local_graph.row_map[lid+1]
446 - local_graph.row_map[lid]);
// Remote rows: each entry is counted as half of the row's packets.
453 size_t tot_num_ent = 0;
454 Kokkos::parallel_reduce(
"SumReduce",
455 num_packets_per_lid.size(),
456 KOKKOS_LAMBDA(
const int& i,
size_t& lsum) {
457 lsum += num_packets_per_lid(i) / 2;
458 }, Kokkos::Sum<size_t>(tot_num_ent));
459 count += tot_num_ent;
// For every import LID, adds that row's incoming entry count (half of its
// packet count) to tgt_rowptr at the target row import_lids(i). The add
// is atomic.
466 template<
class Packet,
class LO,
class Device,
class BufferDevice>
469 const Kokkos::View<size_t*, Device>& tgt_rowptr,
470 const Kokkos::View<const LO*, Device>& import_lids,
471 const Kokkos::View<const Packet*, BufferDevice>& imports,
472 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
// NOTE(review): the using-declaration names parallel_reduce, while the
// kernel launched below is a parallel_for.
474 using Kokkos::parallel_reduce;
475 typedef Device device_type;
476 typedef typename device_type::execution_space execution_space;
477 typedef typename Kokkos::View<size_t*,device_type>::size_type size_type;
478 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
480 const size_type N = num_packets_per_lid.extent(0);
481 parallel_for(
"Setup row pointers for remotes",
483 KOKKOS_LAMBDA(
const size_t i){
// The increment is given the element type of tgt_rowptr.
484 typedef typename std::remove_reference<decltype(tgt_rowptr(0))>::type atomic_incr_type;
485 const size_t num_packets_this_lid = num_packets_per_lid(i);
486 const size_t num_ent = num_packets_this_lid / 2;
487 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
// Scan-style kernel over the N = new_start_row.extent(0) slots: each slot
// of tgt_rowptr is overwritten with the running value `update`, and the
// same value is copied into new_start_row. The previous value of the
// slot is read into cur_val first (presumably the row length that feeds
// the running sum - TODO confirm).
492 template<
class Device>
494 makeCrsRowPtrFromLengths(
495 const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
496 const Kokkos::View<size_t*,Device>& new_start_row)
498 using Kokkos::parallel_scan;
499 typedef Device device_type;
500 typedef typename device_type::execution_space execution_space;
501 typedef typename Kokkos::View<size_t*,device_type>::size_type size_type;
502 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
503 const size_type N = new_start_row.extent(0);
506 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
// Read the old value before the slot is overwritten below.
507 auto cur_val = tgt_rowptr(i);
509 tgt_rowptr(i) = update;
510 new_start_row(i) = tgt_rowptr(i);
// Copies the first num_same_ids rows of local_graph into the target
// arrays. Source and target row share the same local index. Column
// indices are converted to global indices through local_col_map, and each
// entry's owning PID is taken from src_pids, with my_pid replaced by -1.
517 template<
class LocalGraph,
class LocalMap>
520 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
521 typename LocalMap::device_type>& tgt_colind,
522 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
523 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
524 const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
525 const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
526 const LocalGraph& local_graph,
528 const size_t num_same_ids,
531 using Kokkos::parallel_for;
532 typedef typename LocalMap::device_type device_type;
533 typedef typename LocalMap::local_ordinal_type LO;
534 typedef typename device_type::execution_space execution_space;
535 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t> > range_policy;
538 range_policy(0, num_same_ids),
539 KOKKOS_LAMBDA(
const size_t i) {
540 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
542 const LO src_lid = static_cast<LO>(i);
543 size_t src_row = local_graph.row_map(src_lid);
545 const LO tgt_lid = static_cast<LO>(i);
546 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the row's write position by the number of entries copied here.
548 const size_t nsr = local_graph.row_map(src_lid+1)
549 - local_graph.row_map(src_lid);
550 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source row lands at tgt_row + (j - src_row).
552 for (
size_t j=local_graph.row_map(src_lid);
553 j<local_graph.row_map(src_lid+1); ++j) {
554 LO src_col = local_graph.entries(j);
555 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
556 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copies permuted rows of local_graph into the target arrays: for each i,
// source row permute_from_lids(i) is written to target row
// permute_to_lids(i). Column indices are converted to global indices
// through local_col_map, and each entry's owning PID is taken from
// src_pids, with my_pid replaced by -1.
562 template<
class LocalGraph,
class LocalMap>
564 copyDataFromPermuteIDs(
565 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
566 typename LocalMap::device_type>& tgt_colind,
567 const Kokkos::View<
int*,
568 typename LocalMap::device_type>& tgt_pids,
569 const Kokkos::View<
size_t*,
570 typename LocalMap::device_type>& new_start_row,
571 const Kokkos::View<
size_t*,
572 typename LocalMap::device_type>& tgt_rowptr,
573 const Kokkos::View<
const int*,
574 typename LocalMap::device_type>& src_pids,
575 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
576 typename LocalMap::device_type>& permute_to_lids,
577 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
578 typename LocalMap::device_type>& permute_from_lids,
579 const LocalGraph& local_graph,
583 using Kokkos::parallel_for;
584 typedef typename LocalMap::device_type device_type;
585 typedef typename LocalMap::local_ordinal_type LO;
586 typedef typename device_type::execution_space execution_space;
587 typedef typename Kokkos::View<LO*,device_type>::size_type size_type;
588 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
590 const size_type num_permute_to_lids = permute_to_lids.extent(0);
593 range_policy(0, num_permute_to_lids),
594 KOKKOS_LAMBDA(
const size_t i) {
595 typedef typename std::remove_reference<decltype(new_start_row(0)) >::type atomic_incr_type;
597 const LO src_lid = permute_from_lids(i);
598 const size_t src_row = local_graph.row_map(src_lid);
600 const LO tgt_lid = permute_to_lids(i);
601 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the row's write position by the number of entries copied here.
603 size_t nsr = local_graph.row_map(src_lid+1)
604 - local_graph.row_map(src_lid);
605 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source row lands at tgt_row + (j - src_row).
607 for (
size_t j=local_graph.row_map(src_lid);
608 j<local_graph.row_map(src_lid+1); ++j) {
609 LO src_col = local_graph.entries(j);
610 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
611 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpacks the received rows directly into the target CRS arrays.
// For each import LID, a range of num_ent slots is reserved in the target
// row by atomically advancing new_start_row(lcl_row). unpackRow then
// writes the global indices and PIDs into that range, and PIDs equal to
// my_pid are replaced by -1. The status values returned by unpackRow are
// summed by the reduction; a nonzero gbl_err_count raises
// std::invalid_argument.
617 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
619 unpackAndCombineIntoCrsArrays2(
620 const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
621 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
622 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
623 const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
624 const Kokkos::View<const typename LocalMap::local_ordinal_type*, typename LocalMap::device_type>& import_lids,
625 const Kokkos::View<const Packet*, BufferDevice>& imports,
626 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
627 const LocalGraph& local_graph,
632 using Kokkos::subview;
633 using Kokkos::MemoryUnmanaged;
634 using Kokkos::parallel_reduce;
635 using Kokkos::atomic_fetch_add;
637 typedef Packet packet_type;
638 typedef BufferDevice buffer_device_type;
639 typedef typename LocalMap::device_type device_type;
640 typedef typename LocalMap::local_ordinal_type LO;
641 typedef typename LocalMap::global_ordinal_type GO;
642 typedef typename device_type::execution_space execution_space;
643 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
644 typedef typename Kokkos::pair<size_type, size_type> slice;
645 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
647 typedef View<int*,device_type, MemoryUnmanaged> pids_out_type;
648 typedef View<GO*, device_type, MemoryUnmanaged> gids_out_type;
650 const size_type num_import_lids = import_lids.size();
651 const char prefix[] =
"UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
655 parallel_reduce(
"Unpack and combine into CRS",
656 range_policy(0, num_import_lids),
657 KOKKOS_LAMBDA(
const size_t i,
int& err) {
658 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
// Two packets per entry: one index and one PID.
659 const size_t num_packets_this_lid = num_packets_per_lid(i);
660 const size_t num_ent = num_packets_this_lid / 2;
661 const size_t offset = offsets(i);
662 const LO lcl_row = import_lids(i);
// The fetch-add returns the previous write position, so this iteration
// owns [start_row, end_row) even if other iterations add to the same row.
663 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
664 const size_t end_row = start_row + num_ent;
666 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
667 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
669 err += unpackRow<packet_type,GO,device_type,buffer_device_type>(
670 gids_out, pids_out, imports, offset, num_ent);
// Entries owned by this process are marked with -1 instead of my_pid.
673 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
674 const int pid = pids_out(j);
675 pids_out(j) = (pid != my_pid) ? pid : -1;
679 TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
680 std::invalid_argument, prefix <<
681 "Attempting to unpack PIDs, but num_ent is not even; this should never "
682 "happen! Please report this bug to the Tpetra developers.");
// Builds the target CRS arrays (tgt_rowptr, tgt_colind, tgt_pids) from
// three sources: rows that keep their local index (same IDs), permuted
// rows, and rows received in the import buffer.
// Steps, in order: fill tgt_rowptr with per-row lengths, add the remote
// row lengths, convert lengths to row offsets, check the final offset
// against tgt_num_nonzeros, copy the local rows, then unpack the remotes.
687 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
690 const LocalGraph & local_graph,
692 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
693 typename LocalMap::device_type,
694 Kokkos::MemoryUnmanaged>& import_lids,
695 const Kokkos::View<const Packet*, BufferDevice>& imports,
696 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
697 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
698 typename LocalMap::device_type,
699 Kokkos::MemoryUnmanaged>& permute_to_lids,
700 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
701 typename LocalMap::device_type,
702 Kokkos::MemoryUnmanaged>& permute_from_lids,
703 const Kokkos::View<
size_t*,
704 typename LocalMap::device_type,
705 Kokkos::MemoryUnmanaged>& tgt_rowptr,
706 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
707 typename LocalMap::device_type,
708 Kokkos::MemoryUnmanaged>& tgt_colind,
709 const Kokkos::View<
const int*,
710 typename LocalMap::device_type,
711 Kokkos::MemoryUnmanaged>& src_pids,
712 const Kokkos::View<
int*,
713 typename LocalMap::device_type,
714 Kokkos::MemoryUnmanaged>& tgt_pids,
715 const size_t num_same_ids,
716 const size_t tgt_num_rows,
717 const size_t tgt_num_nonzeros,
718 const int my_tgt_pid)
721 using Kokkos::subview;
722 using Kokkos::parallel_for;
723 using Kokkos::MemoryUnmanaged;
724 typedef Packet packet_type;
726 typedef LocalGraph local_graph_type;
727 typedef BufferDevice buffer_device_type;
728 typedef typename LocalMap::device_type device_type;
729 typedef typename LocalMap::local_ordinal_type LO;
730 typedef typename device_type::execution_space execution_space;
731 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
732 typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t> > range_policy;
734 const char prefix[] =
"UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
736 const size_t N = tgt_num_rows;
737 const size_t mynnz = tgt_num_nonzeros;
741 const int my_pid = my_tgt_pid;
// Pass over all N+1 row-pointer slots (presumably to reset tgt_rowptr -
// TODO confirm).
745 range_policy(0, N+1),
746 KOKKOS_LAMBDA(
const size_t i) {
// Same IDs: the target row length equals the source row length.
753 range_policy(0, num_same_ids),
754 KOKKOS_LAMBDA(
const size_t i) {
755 const LO tgt_lid = static_cast<LO>(i);
756 const LO src_lid = static_cast<LO>(i);
757 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
758 - local_graph.row_map(src_lid);
// Permuted IDs: row permute_to_lids(i) gets the length of source row
// permute_from_lids(i).
763 const size_type num_permute_to_lids = permute_to_lids.extent(0);
765 range_policy(0, num_permute_to_lids),
766 KOKKOS_LAMBDA(
const size_t i) {
767 const LO tgt_lid = permute_to_lids(i);
768 const LO src_lid = permute_from_lids(i);
769 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
770 - local_graph.row_map(src_lid);
// Offsets into the import buffer: one per import LID plus a trailing slot.
775 const size_type num_import_lids = import_lids.extent(0);
776 View<size_t*, device_type> offsets(
"offsets", num_import_lids+1);
// Debug builds only: the trailing offset must equal the buffer length.
779 #ifdef HAVE_TPETRA_DEBUG
781 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
782 const bool condition =
783 nth_offset_h != static_cast<size_t>(imports.extent(0));
784 TEUCHOS_TEST_FOR_EXCEPTION
785 (condition, std::logic_error, prefix
786 <<
"The final offset in bytes " << nth_offset_h
787 <<
" != imports.size() = " << imports.extent(0)
788 <<
". Please report this bug to the Tpetra developers.");
790 #endif // HAVE_TPETRA_DEBUG
// Add the lengths of the incoming remote rows to tgt_rowptr.
793 setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
794 tgt_rowptr, import_lids, imports, num_packets_per_lid);
// new_start_row tracks the next free slot of each target row while the
// copy and unpack kernels below fill it.
798 View<size_t*, device_type> new_start_row(
"new_start_row", N+1);
801 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// Consistency check: the last row pointer must equal the expected number
// of nonzeros.
803 auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
804 bool condition = nth_tgt_rowptr_h != mynnz;
805 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
806 prefix <<
"CRS_rowptr[last] = " <<
807 nth_tgt_rowptr_h <<
"!= mynnz = " << mynnz <<
".");
811 copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
812 tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
814 copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
815 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
816 local_graph, local_col_map, my_pid);
// Special case: an empty import buffer (extent is unsigned, so `<= 0`
// is true only for zero).
818 if (imports.extent(0) <= 0) {
822 unpackAndCombineIntoCrsArrays2<
823 packet_type,local_graph_type,local_map_type,buffer_device_type>(
824 tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
825 num_packets_per_lid, local_graph, local_col_map, my_pid);
// Host-array entry point: takes the Teuchos::ArrayView inputs by raw
// pointer and size to build the views handed to the implementation,
// fetches the source graph's local column map, and calls
// UnpackAndCombineCrsGraphImpl::unpackAndCombine with PID unpacking
// disabled (the literal `false` argument).
868 template<
class LO,
class GO,
class Node>
873 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
874 const Teuchos::ArrayView<const LO>& importLIDs,
875 size_t constantNumPackets,
881 typedef typename Node::device_type device_type;
885 static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
886 "Node::device_type and LocalGraph::device_type must be the same.");
// Device instances for the graph's device and for the buffer device.
888 typedef typename device_type::execution_space execution_space;
889 typename execution_space::device_type outputDevice;
891 typedef typename buffer_device_type::execution_space buffer_execution_space;
892 typename buffer_execution_space::device_type bufferOutputDevice;
901 imports.getRawPtr(), imports.size(),
904 auto num_packets_per_lid_d =
906 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
907 true,
"num_packets_per_lid");
911 importLIDs.getRawPtr(), importLIDs.size(),
912 true,
"import_lids");
915 auto local_col_map = sourceGraph.
getColMap()->getLocalMap();
918 typedef decltype(local_col_map) local_map_type;
919 UnpackAndCombineCrsGraphImpl::unpackAndCombine<
920 packet_type,local_graph_type,local_map_type,buffer_device_type>(
921 local_graph, local_col_map, imports_d, num_packets_per_lid_d,
922 import_lids_d, combineMode,
false, atomic);
// DualView entry point: syncs each input DualView to the memory space in
// which it will be read, takes the view in that space, and calls
// UnpackAndCombineCrsGraphImpl::unpackAndCombine with PID unpacking
// disabled (the literal `false` argument).
927 template<
class LO,
class GO,
class Node>
929 unpackCrsGraphAndCombineNew(
933 const Kokkos::DualView<
const size_t*,
935 const Kokkos::DualView<const LO*, typename Node::device_type>& importLIDs,
936 const size_t constantNumPackets,
943 typedef typename Node::device_type device_type;
945 typedef typename crs_graph_type::packet_type packet_type;
946 typedef typename crs_graph_type::local_graph_type local_graph_type;
947 typedef typename crs_graph_type::buffer_device_type buffer_device_type;
948 typedef typename buffer_device_type::memory_space buffer_memory_space;
949 typedef typename device_type::memory_space memory_space;
951 static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
952 "Node::device_type and LocalGraph::device_type must be "
// Packet counts and packets are read in the buffer memory space; the
// import LIDs are read in the graph device's memory space.
957 numPacketsPerLID_nc.template sync<buffer_memory_space>();
959 auto num_packets_per_lid_d = numPacketsPerLID.template view<buffer_memory_space>();
963 importLIDs_nc.template sync<memory_space>();
965 auto import_lids_d = importLIDs.template view<memory_space>();
969 imports_nc.template sync<buffer_memory_space>();
971 auto imports_d = imports.template view<buffer_memory_space>();
974 auto local_col_map = sourceGraph.
getColMap()->getLocalMap();
975 typedef decltype(local_col_map) local_map_type;
978 UnpackAndCombineCrsGraphImpl::unpackAndCombine<
979 packet_type,local_graph_type,local_map_type,buffer_device_type>(
980 local_graph, local_col_map, imports_d, num_packets_per_lid_d,
981 import_lids_d, combineMode,
false, atomic);
// Host-array entry point for the entry count. Validates the inputs
// (matching permute list sizes, a locally indexed source graph, one packet
// count per import LID), builds the views from the raw host arrays, and
// forwards to the implementation together with numSameIDs.
// Throws std::invalid_argument when a validation check fails.
1036 template<
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1040 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1042 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1043 size_t constantNumPackets,
1047 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1048 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1050 using Kokkos::MemoryUnmanaged;
1052 typedef typename Node::device_type device_type;
1056 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
// The permute lists are paired element by element, so sizes must match.
1058 TEUCHOS_TEST_FOR_EXCEPTION
1059 (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1060 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size() <<
" != "
1061 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1065 TEUCHOS_TEST_FOR_EXCEPTION
1066 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1067 "CrsGraph 'sourceGraph' must be locally indexed.");
1068 TEUCHOS_TEST_FOR_EXCEPTION
1069 (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1070 prefix <<
"importLIDs.size() = " << importLIDs.size() <<
" != "
1071 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
// Views over the caller's host arrays, passed to the implementation below.
1074 auto permute_from_lids_d =
1076 permuteFromLIDs.getRawPtr(),
1077 permuteFromLIDs.size(),
true,
1078 "permute_from_lids");
1081 imports.getRawPtr(),
1082 imports.size(),
true,
1084 auto num_packets_per_lid_d =
1086 numPacketsPerLID.getRawPtr(),
1087 numPacketsPerLID.size(),
true,
1088 "num_packets_per_lid");
1091 packet_type,local_graph_type,buffer_device_type>(
1092 local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
// Host-array entry point that fills CRS_rowptr, CRS_colind and TargetPids.
// Validates the array sizes, sizes TargetPids to TargetNumNonzeros and
// fills it with -1, builds views from the raw host arrays, and forwards
// to the implementation. Host views over the caller's output arrays are
// created afterwards (presumably as the destination for copying the
// results back - TODO confirm).
// Throws std::invalid_argument when a size check fails.
1108 template<
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1112 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1114 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1115 const size_t constantNumPackets,
1118 const size_t numSameIDs,
1119 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1120 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1121 size_t TargetNumRows,
1122 size_t TargetNumNonzeros,
1123 const int MyTargetPID,
1124 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1125 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1126 const Teuchos::ArrayView<const int>& SourcePids,
1127 Teuchos::Array<int>& TargetPids)
1131 using Teuchos::ArrayView;
1132 using Teuchos::outArg;
1133 using Teuchos::REDUCE_MAX;
1134 using Teuchos::reduceAll;
1135 typedef LocalOrdinal LO;
1139 typedef typename Node::device_type device_type;
1140 typedef typename device_type::execution_space execution_space;
1141 typedef typename buffer_device_type::execution_space buffer_execution_space;
1142 typedef typename ArrayView<const LO>::size_type size_type;
1144 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
// The row-pointer array needs one slot per row plus a final end marker.
1146 TEUCHOS_TEST_FOR_EXCEPTION(
1147 TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
1148 std::invalid_argument, prefix <<
"CRS_rowptr.size() = " <<
1149 CRS_rowptr.size() <<
"!= TargetNumRows+1 = " << TargetNumRows+1 <<
".");
1151 TEUCHOS_TEST_FOR_EXCEPTION(
1152 permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1153 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size()
1154 <<
"!= permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1155 const size_type numImportLIDs = importLIDs.size();
1157 TEUCHOS_TEST_FOR_EXCEPTION(
1158 numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
1159 prefix <<
"importLIDs.size() = " << numImportLIDs <<
" != "
1160 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
// One PID slot per target nonzero, initialised to -1.
1163 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1164 TargetPids.resize(TargetNumNonzeros);
1166 TargetPids.assign(TargetNumNonzeros, -1);
1170 auto local_col_map = sourceGraph.
getColMap()->getLocalMap();
// Device instances for the graph's device and for the buffer device.
1173 typename execution_space::device_type outputDevice;
1174 typename buffer_execution_space::device_type bufferOutputDevice;
1177 importLIDs.getRawPtr(), importLIDs.size(),
1178 true,
"import_lids");
1181 imports.getRawPtr(), imports.size(),
1185 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1186 true,
"num_packets_per_lid");
1189 permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1190 true,
"permute_from_lids");
1193 permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1194 true,
"permute_to_lids");
1197 CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1198 true,
"crs_rowptr");
1201 CRS_colind.getRawPtr(), CRS_colind.size(),
1202 true,
"crs_colidx");
1205 SourcePids.getRawPtr(), SourcePids.size(),
1209 TargetPids.getRawPtr(), TargetPids.size(),
1212 typedef decltype(local_col_map) local_map_type;
1214 packet_type,local_graph_type,local_map_type,buffer_device_type>(
1215 local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1216 permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1217 tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
// Host views constructed over the caller's output arrays.
1220 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1221 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1224 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1225 CRS_colind.getRawPtr(), CRS_colind.size());
1228 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1229 TargetPids.getRawPtr(), TargetPids.size());
// Macro that lists, for one (LO, GO, NT) combination, the signatures of
// the four Details:: entry points defined in this file:
// unpackCrsGraphAndCombine, unpackCrsGraphAndCombineNew,
// unpackAndCombineIntoCrsArrays and unpackAndCombineWithOwningPIDsCount.
// Presumably used for explicit template instantiation (per the _INSTANT
// suffix - TODO confirm). Every line of the macro body must end in a
// backslash, so no comments are placed inside it.
1237 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1239 Details::unpackCrsGraphAndCombine<LO, GO, NT>( \
1240 const CrsGraph<LO, GO, NT>&, \
1241 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1242 const Teuchos::ArrayView<const size_t>&, \
1243 const Teuchos::ArrayView<const LO>&, \
1249 Details::unpackCrsGraphAndCombineNew<LO, GO, NT>( \
1250 const CrsGraph<LO, GO, NT>&, \
1251 const Kokkos::DualView<const typename CrsGraph<LO,GO,NT>::packet_type*, \
1252 typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1253 const Kokkos::DualView<const size_t*, typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1254 const Kokkos::DualView<const LO*, NT::device_type>&, \
1257 const CombineMode, \
1260 Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1261 const CrsGraph<LO, GO, NT> &, \
1262 const Teuchos::ArrayView<const LO>&, \
1263 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1264 const Teuchos::ArrayView<const size_t>&, \
1267 const CombineMode, \
1269 const Teuchos::ArrayView<const LO>&, \
1270 const Teuchos::ArrayView<const LO>&, \
1274 const Teuchos::ArrayView<size_t>&, \
1275 const Teuchos::ArrayView<GO>&, \
1276 const Teuchos::ArrayView<const int>&, \
1277 Teuchos::Array<int>&); \
1279 Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1280 const CrsGraph<LO, GO, NT> &, \
1281 const Teuchos::ArrayView<const LO> &, \
1282 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1283 const Teuchos::ArrayView<const size_t>&, \
1288 const Teuchos::ArrayView<const LO>&, \
1289 const Teuchos::ArrayView<const LO>&);
1291 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP