Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
53 #include "Tpetra_CrsGraph_decl.hpp"
55 #include "Kokkos_Core.hpp"
56 #include <memory>
57 #include <string>
58 
77 
78 namespace Tpetra {
79 
80 #ifndef DOXYGEN_SHOULD_SKIP_THIS
81 // Forward declaration of Distributor
82 class Distributor;
83 #endif // DOXYGEN_SHOULD_SKIP_THIS
84 
85 //
86 // Users must never rely on anything in the Details namespace.
87 //
88 namespace Details {
89 
90 namespace UnpackAndCombineCrsGraphImpl {
91 
101 template<class Packet, class GO, class Device, class BufferDevice>
102 KOKKOS_FUNCTION int
103 unpackRow(typename Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
104  typename Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
105  const Kokkos::View<const Packet*,BufferDevice>& imports,
106  const size_t offset,
107  const size_t num_ent)
108 {
109  typedef typename Kokkos::View<GO*,Device>::size_type size_type;
110 
111  if (num_ent == 0) {
112  // Empty rows always take zero bytes, to ensure sparsity.
113  return 0;
114  }
115 
116  // Unpack GIDs
117  for (size_type k=0; k<num_ent; k++)
118  gids_out(k) = imports(offset+k);
119 
120  // Unpack PIDs
121  if (pids_out.size() > 0) {
122  for (size_type k=0; k<num_ent; k++)
123  pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
124  }
125 
126  return 0;
127 }
128 
// Reduction functor used to unpack received graph rows on device.
//
// NOTE(review): the line that opens this definition (class/struct keyword,
// name and opening brace) is absent from this listing; everything down to the
// closing "};" is its body.  The error message emitted by the driver further
// down in this file calls it "UnpackAndCombineFunctor".
//
// Template parameters:
//   Packet       - element type of the receive buffer.
//   LocalGraph   - local graph type; its data_type must equal the local
//                  ordinal type of LocalMap (enforced by the static_assert).
//   LocalMap     - local column map type; supplies the ordinal types and the
//                  device type.
//   BufferDevice - device type of the receive buffer and of the per-row
//                  packet counts.
139 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
141 
142  typedef Packet packet_type;
143  typedef LocalMap local_map_type;
144  typedef LocalGraph local_graph_type;
145  typedef BufferDevice buffer_device_type;
146 
147  typedef typename local_map_type::local_ordinal_type LO;
148  typedef typename local_map_type::global_ordinal_type GO;
149  // Kokkos::parallel_reduce fails to compile if named device_type and typedef
150  // is public
151  typedef typename local_map_type::device_type device_type;
152  typedef typename device_type::execution_space execution_space;
153 
// Read-only input views.  Packet counts and the packet buffer live on the
// buffer device; offsets and import LIDs live on the map's device.
154  typedef Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_type;
155  typedef Kokkos::View<const size_t*, device_type> offsets_type;
156  typedef Kokkos::View<const packet_type*, buffer_device_type> input_buffer_type;
157  typedef Kokkos::View<const LO*, device_type> import_lids_type;
158 
// Writable scratch views; operator() takes a per-token slice of each.
159  typedef Kokkos::View<LO*, device_type> lids_scratch_type;
160  typedef Kokkos::View<GO*, device_type> gids_scratch_type;
161  typedef Kokkos::View<int*,device_type> pids_scratch_type;
162 
163  static_assert(std::is_same<LO, typename local_graph_type::data_type>::value,
164  "LocalMap::local_ordinal_type and "
165  "LocalGraph::data_type must be the same.");
166 
// Data members, all set by the constructor.
167  local_graph_type local_graph;
168  local_map_type local_col_map;
169  input_buffer_type imports;
170  num_packets_per_lid_type num_packets_per_lid;
171  import_lids_type import_lids;
172  offsets_type offsets;
173  Tpetra::CombineMode combine_mode;
// Length of each per-token slice of the scratch views.
174  size_t max_num_ent;
// When true, each row's packets are treated as num_ent GIDs followed by
// num_ent PIDs; when false, all packets of a row are GIDs.
175  bool unpack_pids;
176  bool atomic;
// Hands out indices in [0, tokens.size()) that select a scratch slice.
177  Kokkos::Experimental::UniqueToken<execution_space,
178  Kokkos::Experimental::UniqueTokenScope::Global> tokens;
179  lids_scratch_type lids_scratch;
180  gids_scratch_type gids_scratch;
181  pids_scratch_type pids_scratch;
182 
183  public:
// Reduction value: (error code, index of the offending row).  A second
// component equal to OrdinalTraits<LO>::invalid() means "no error".
184  typedef Kokkos::pair<int, LO> value_type;
185 
// Constructor.
// NOTE(review): the line carrying the constructor's name and opening
// parenthesis is absent from this listing.
//
// Every argument is copied into the member of the same name (minus the "_in"
// suffix).  The token pool is built from a default-constructed execution
// space, and each scratch view is allocated with
// tokens.size() * max_num_ent entries, i.e. one slice of max_num_ent entries
// per token.
187  const local_graph_type& local_graph_in,
188  const local_map_type& local_col_map_in,
189  const input_buffer_type& imports_in,
190  const num_packets_per_lid_type& num_packets_per_lid_in,
191  const import_lids_type& import_lids_in,
192  const offsets_type& offsets_in,
193  const Tpetra::CombineMode combine_mode_in,
194  const size_t max_num_ent_in,
195  const bool unpack_pids_in,
196  const bool atomic_in) :
197  local_graph(local_graph_in),
198  local_col_map(local_col_map_in),
199  imports(imports_in),
200  num_packets_per_lid(num_packets_per_lid_in),
201  import_lids(import_lids_in),
202  offsets(offsets_in),
203  combine_mode(combine_mode_in),
204  max_num_ent(max_num_ent_in),
205  unpack_pids(unpack_pids_in),
206  atomic(atomic_in),
207  tokens(execution_space()),
// NOTE(review): the labels of lids_scratch and pids_scratch look swapped
// (lids_scratch is labelled "pids_scratch" and vice versa).  The label is only
// a name attached to the allocation, but it will mislead anyone reading
// profiling or debugging output.
208  lids_scratch("pids_scratch", tokens.size() * max_num_ent),
209  gids_scratch("gids_scratch", tokens.size() * max_num_ent),
210  pids_scratch("lids_scratch", tokens.size() * max_num_ent)
211  {}
212 
213  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
214  {
215  using Tpetra::Details::OrdinalTraits;
216  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
217  }
218 
219  KOKKOS_INLINE_FUNCTION void
220  join(volatile value_type& dst, const volatile value_type& src) const
221  {
222  // `dst` should reflect the first (least) bad index and
223  // all other associated error codes and data. Thus, we need only
224  // check if the `src` object shows an error and if its associated
225  // bad index is less than `dst`'s bad index.
226  using Tpetra::Details::OrdinalTraits;
227  if (src.second != OrdinalTraits<LO>::invalid()) {
228  // An error in the src; check if
229  // 1. `dst` shows errors
230  // 2. If `dst` does show errors, if src's bad index is less than
231  // *this' bad index
232  if (dst.second == OrdinalTraits<LO>::invalid() ||
233  src.second < dst.second) {
234  dst = src;
235  }
236  }
237  }
238 
239  KOKKOS_INLINE_FUNCTION
240  void operator()(const LO i, value_type& dst) const
241  {
242  using Kokkos::View;
243  using Kokkos::subview;
244  using Kokkos::MemoryUnmanaged;
245  typedef typename execution_space::size_type size_type;
246  typedef typename Kokkos::pair<size_type, size_type> slice;
247 
248  typedef View<LO*, device_type, MemoryUnmanaged> lids_out_type;
249  typedef View<int*,device_type, MemoryUnmanaged> pids_out_type;
250  typedef View<GO*, device_type, MemoryUnmanaged> gids_out_type;
251 
252  const size_t num_packets_this_lid = num_packets_per_lid(i);
253  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
254  : num_packets_this_lid;
255  if (unpack_pids && num_packets_this_lid%2 != 0) {
256  // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
257  // should never
258  dst = Kokkos::make_pair(1, i);
259  return;
260  }
261 
262  // Only unpack data if there is a nonzero number to unpack
263  if (num_ent == 0) {
264  return;
265  }
266 
267  // there is actually something in the row
268  const size_t buf_size = imports.size();
269  const size_t offset = offsets(i);
270 
271  if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
272  dst = Kokkos::make_pair(2, i); // out of bounds
273  return;
274  }
275 
276  // Get subviews in to the scratch arrays. The token returned from acquire
277  // is an integer in [0, tokens.size()). It is used to grab a unique (to
278  // this thread) subview of the scratch arrays.
279  const size_type token = tokens.acquire();
280  const size_t a = static_cast<size_t>(token) * max_num_ent;
281  const size_t b = a + num_ent;
282  lids_out_type lids_out = subview(lids_scratch, slice(a, b));
283  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
284  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
285 
286  // Unpack this row!
287  int err = unpackRow<packet_type,GO,device_type,buffer_device_type>(
288  gids_out, pids_out, imports, offset, num_ent);
289 
290  if (err != 0) {
291  dst = Kokkos::make_pair(3, i);
292  return;
293  }
294 
295  // Column indices come in as global indices, in case the
296  // source object's column Map differs from the target object's
297  // (this's) column Map, and must be converted local index values
298  for (size_t k = 0; k < num_ent; ++k) {
299  lids_out(k) = local_col_map.getLocalElement(gids_out(k));
300  }
301 
302  tokens.release(token);
303  }
304 };
305 
// Device-side driver intended to unpack received graph rows and combine them
// into the local graph.
//
// NOTE(review): the line carrying the function name is absent from this
// listing.  The call site later in this file and the `prefix` string below
// both name it `unpackAndCombine`.
//
// As written, the function throws std::invalid_argument unconditionally on
// entry (first TEUCHOS_TEST_FOR_EXCEPTION below), so none of the code after
// that statement executes.
//
// Parameters:
//   local_graph         - destination local graph.
//   local_map           - local column map, forwarded to the unpack functor.
//   imports             - packed receive buffer.
//   num_packets_per_lid - number of packets received for each import LID.
//   import_lids         - local row indices being received.
//   combine_mode        - only REPLACE passes the checks below.
//   unpack_pids         - whether each row's packets hold GIDs followed by
//                         PIDs (true) or GIDs only (false).
//   atomic              - forwarded to the unpack functor.
313 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
314 void
316  const LocalGraph& local_graph,
317  const LocalMap& local_map,
318  const Kokkos::View<const Packet*, BufferDevice, Kokkos::MemoryUnmanaged>& imports,
319  const Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>& num_packets_per_lid,
320  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
321  typename LocalMap::device_type,
322  Kokkos::MemoryUnmanaged>& import_lids,
323  const Tpetra::CombineMode combine_mode,
324  const bool unpack_pids,
325  const bool atomic)
326 {
327 
// Hard stop: the implementation is unfinished (see the message text).
328  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
329  "unpackAndCombine[New] should not (yet) be called, the method is "
330  "incomplete. To complete, indices need to be inserted (unpacked) in to "
331  "the destination graph. The local graph, a Kokkos::StaticCrsGraph, does "
332  "not support insertion of indices");
333 
334  typedef typename LocalMap::local_ordinal_type LO;
335  typedef typename LocalMap::device_type device_type;
336  typedef typename device_type::execution_space execution_space;
337  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO> > range_policy;
// NOTE(review): a listing line is missing here; presumably it held the
// typedef of `unpack_functor_type`, which is used below but not declared in
// the visible text.
339 
340  const char prefix[] =
341  "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
342 
343  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
344  if (num_import_lids == 0) {
345  // Nothing to unpack
346  return;
347  }
348 
349  {
// INSERT is rejected first, then anything other than REPLACE; REPLACE is
// therefore the only combine mode that gets past these two checks.
350  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
351  std::invalid_argument,
352  prefix << "INSERT combine mode is not allowed if the graph has a static graph "
353  "(i.e., was constructed with the CrsGraph constructor that takes a "
354  "const CrsGraph pointer).");
355 
356  // Unknown combine mode!
357  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode != REPLACE,
358  std::invalid_argument,
359  prefix << "Invalid combine mode; should never get "
360  "here! Please report this bug to the Tpetra developers.");
361 
362  // Check that sizes of input objects are consistent.
363  bool bad_num_import_lids =
364  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
365  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
366  std::invalid_argument,
367  prefix << "importLIDs.size() (" << num_import_lids << ") != "
368  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
369  } // end QA error checking
370 
371  // Get the offsets
// One more entry than there are import LIDs.
372  Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
373  computeOffsetsFromCounts(offsets, num_packets_per_lid);
374 
375  // Determine the maximum number of entries in any row in the graph. The
376  // maximum number of entries is needed to allocate unpack buffers on the
377  // device.
// max_num_ent is not initialized here; it receives its value through the
// Kokkos::Max reducer.  When PIDs are packed, a row's entry count is half
// its packet count.
378  size_t max_num_ent;
379  Kokkos::parallel_reduce("MaxReduce",
380  num_packets_per_lid.size(),
381  KOKKOS_LAMBDA(const int& i, size_t& running_max_num_ent) {
382  size_t num_packets_this_lid = num_packets_per_lid(i);
383  size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
384  : num_packets_this_lid;
385  if (num_ent > running_max_num_ent) running_max_num_ent = num_ent;
386  }, Kokkos::Max<size_t>(max_num_ent));
387 
388  // Now do the actual unpack!
389  unpack_functor_type f(local_graph, local_map,
390  imports, num_packets_per_lid, import_lids, offsets, combine_mode,
391  max_num_ent, unpack_pids, atomic);
392 
// x ends up holding (error code, first bad row); a nonzero error code is
// turned into a std::runtime_error on the host.
393  typename unpack_functor_type::value_type x;
394  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
395  auto x_h = x.to_std_pair();
396  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
397  prefix << "UnpackAndCombineFunctor reported error code "
398  << x_h.first << " for the first bad row " << x_h.second);
399 
400  return;
401 }
402 
// Count the graph entries that result from combining three sources: rows
// copied unchanged ("same IDs"), rows copied with permutation, and rows
// received in the import buffer.
//
// NOTE(review): the line carrying the function name is absent from this
// listing, and the name does not appear elsewhere in the visible text.
//
// Parameters:
//   local_graph         - source local graph; only row_map is read.
//   permute_from_lids   - source local row indices of the permuted rows.
//   imports             - packed receive buffer; not read in this function.
//   num_packets_per_lid - number of packets received per import LID.
//   num_same_ids        - number of leading rows copied unchanged.
//
// Returns the sum of the three partial counts as size_t.
403 template<class Packet, class LocalGraph, class BufferDevice>
404 size_t
406  const LocalGraph& local_graph,
407  const Kokkos::View<const typename LocalGraph::data_type*,
408  typename LocalGraph::device_type,
409  Kokkos::MemoryUnmanaged> permute_from_lids,
410  const Kokkos::View<const Packet*, BufferDevice>& imports,
411  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
412  const size_t num_same_ids)
413 {
414  using Kokkos::parallel_reduce;
415  typedef LocalGraph local_graph_type;
416  typedef typename local_graph_type::data_type LO;
417  typedef typename local_graph_type::device_type device_type;
418  typedef typename device_type::execution_space execution_space;
419  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO> > range_policy;
420 
421  size_t count = 0;
422  LO num_items;
423 
424  // Number of graph entries to unpack (returned by this function).
// Rows [0, num_same_ids): add up the row lengths taken from row_map.
// NOTE(review): num_same_ids is narrowed from size_t to LO here.
425  num_items = static_cast<LO>(num_same_ids);
426  if (num_items) {
427  size_t kcnt = 0;
428  parallel_reduce(
429  range_policy(0, num_items),
430  KOKKOS_LAMBDA(const LO lid, size_t& update) {
431  update += static_cast<size_t>(local_graph.row_map[lid+1]
432  -local_graph.row_map[lid]);
433  }, kcnt);
434  count += kcnt;
435  }
436 
437  // Count entries copied directly from the source graph with permuting.
438  num_items = static_cast<LO>(permute_from_lids.extent(0));
439  if (num_items) {
440  size_t kcnt = 0;
441  parallel_reduce(
442  range_policy(0, num_items),
443  KOKKOS_LAMBDA(const LO i, size_t& update) {
444  const LO lid = permute_from_lids(i);
445  update += static_cast<size_t>(local_graph.row_map[lid+1]
446  - local_graph.row_map[lid]);
447  }, kcnt);
448  count += kcnt;
449  }
450 
451  {
452  // Count entries received from other MPI processes.
// Each received row contributes half of its packet count, matching the
// layout read by unpackRow above (num_ent GIDs followed by num_ent PIDs).
453  size_t tot_num_ent = 0;
454  Kokkos::parallel_reduce("SumReduce",
455  num_packets_per_lid.size(),
456  KOKKOS_LAMBDA(const int& i, size_t& lsum) {
457  lsum += num_packets_per_lid(i) / 2;
458  }, Kokkos::Sum<size_t>(tot_num_ent));
459  count += tot_num_ent;
460  }
461 
462  return count;
463 }
464 
// Add the number of received entries of every imported row to that row's
// slot in tgt_rowptr.
//
// NOTE(review): the line carrying the function name is absent from this
// listing; the call site later in this file names it
// `setupRowPointersForRemotes`.
//
// Parameters:
//   tgt_rowptr          - per-row counters, incremented in place.
//   import_lids         - target local row index of each received row.
//   imports             - packed receive buffer; not read in this function.
//   num_packets_per_lid - number of packets received per import LID.
466 template<class Packet, class LO, class Device, class BufferDevice>
467 void
469  const Kokkos::View<size_t*, Device>& tgt_rowptr,
470  const Kokkos::View<const LO*, Device>& import_lids,
471  const Kokkos::View<const Packet*, BufferDevice>& imports,
472  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
473 {
// NOTE(review): this using-declaration names parallel_reduce, but the body
// calls parallel_for (unqualified).
474  using Kokkos::parallel_reduce;
475  typedef Device device_type;
476  typedef typename device_type::execution_space execution_space;
477  typedef typename Kokkos::View<size_t*,device_type>::size_type size_type;
478  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
479 
480  const size_type N = num_packets_per_lid.extent(0);
481  parallel_for("Setup row pointers for remotes",
482  range_policy(0, N),
483  KOKKOS_LAMBDA(const size_t i){
484  typedef typename std::remove_reference<decltype(tgt_rowptr(0))>::type atomic_incr_type;
485  const size_t num_packets_this_lid = num_packets_per_lid(i);
// Half the packets of a row are entries; the increment is atomic, so
// several iterations may add to the same target row.
486  const size_t num_ent = num_packets_this_lid / 2;
487  Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
488  });
489 }
490 
491 // Convert array of row lengths to a CRS pointer array
492 template<class Device>
493 void
494 makeCrsRowPtrFromLengths(
495  const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
496  const Kokkos::View<size_t*,Device>& new_start_row)
497 {
498  using Kokkos::parallel_scan;
499  typedef Device device_type;
500  typedef typename device_type::execution_space execution_space;
501  typedef typename Kokkos::View<size_t*,device_type>::size_type size_type;
502  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
503  const size_type N = new_start_row.extent(0);
504  parallel_scan(
505  range_policy(0, N),
506  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
507  auto cur_val = tgt_rowptr(i);
508  if (final) {
509  tgt_rowptr(i) = update;
510  new_start_row(i) = tgt_rowptr(i);
511  }
512  update += cur_val;
513  }
514  );
515 }
516 
517 template<class LocalGraph, class LocalMap>
518 void
519 copyDataFromSameIDs(
520  const Kokkos::View<typename LocalMap::global_ordinal_type*,
521  typename LocalMap::device_type>& tgt_colind,
522  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
523  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
524  const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
525  const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
526  const LocalGraph& local_graph,
527  const LocalMap& local_col_map,
528  const size_t num_same_ids,
529  const int my_pid)
530 {
531  using Kokkos::parallel_for;
532  typedef typename LocalMap::device_type device_type;
533  typedef typename LocalMap::local_ordinal_type LO;
534  typedef typename device_type::execution_space execution_space;
535  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t> > range_policy;
536 
537  parallel_for(
538  range_policy(0, num_same_ids),
539  KOKKOS_LAMBDA(const size_t i) {
540  typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
541 
542  const LO src_lid = static_cast<LO>(i);
543  size_t src_row = local_graph.row_map(src_lid);
544 
545  const LO tgt_lid = static_cast<LO>(i);
546  const size_t tgt_row = tgt_rowptr(tgt_lid);
547 
548  const size_t nsr = local_graph.row_map(src_lid+1)
549  - local_graph.row_map(src_lid);
550  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
551 
552  for (size_t j=local_graph.row_map(src_lid);
553  j<local_graph.row_map(src_lid+1); ++j) {
554  LO src_col = local_graph.entries(j);
555  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
556  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
557  }
558  }
559  );
560 }
561 
562 template<class LocalGraph, class LocalMap>
563 void
564 copyDataFromPermuteIDs(
565  const Kokkos::View<typename LocalMap::global_ordinal_type*,
566  typename LocalMap::device_type>& tgt_colind,
567  const Kokkos::View<int*,
568  typename LocalMap::device_type>& tgt_pids,
569  const Kokkos::View<size_t*,
570  typename LocalMap::device_type>& new_start_row,
571  const Kokkos::View<size_t*,
572  typename LocalMap::device_type>& tgt_rowptr,
573  const Kokkos::View<const int*,
574  typename LocalMap::device_type>& src_pids,
575  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
576  typename LocalMap::device_type>& permute_to_lids,
577  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
578  typename LocalMap::device_type>& permute_from_lids,
579  const LocalGraph& local_graph,
580  const LocalMap& local_col_map,
581  const int my_pid)
582 {
583  using Kokkos::parallel_for;
584  typedef typename LocalMap::device_type device_type;
585  typedef typename LocalMap::local_ordinal_type LO;
586  typedef typename device_type::execution_space execution_space;
587  typedef typename Kokkos::View<LO*,device_type>::size_type size_type;
588  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
589 
590  const size_type num_permute_to_lids = permute_to_lids.extent(0);
591 
592  parallel_for(
593  range_policy(0, num_permute_to_lids),
594  KOKKOS_LAMBDA(const size_t i) {
595  typedef typename std::remove_reference<decltype(new_start_row(0)) >::type atomic_incr_type;
596 
597  const LO src_lid = permute_from_lids(i);
598  const size_t src_row = local_graph.row_map(src_lid);
599 
600  const LO tgt_lid = permute_to_lids(i);
601  const size_t tgt_row = tgt_rowptr(tgt_lid);
602 
603  size_t nsr = local_graph.row_map(src_lid+1)
604  - local_graph.row_map(src_lid);
605  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
606 
607  for (size_t j=local_graph.row_map(src_lid);
608  j<local_graph.row_map(src_lid+1); ++j) {
609  LO src_col = local_graph.entries(j);
610  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
611  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
612  }
613  }
614  );
615 }
616 
617 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
618 void
619 unpackAndCombineIntoCrsArrays2(
620  const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
621  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
622  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
623  const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
624  const Kokkos::View<const typename LocalMap::local_ordinal_type*, typename LocalMap::device_type>& import_lids,
625  const Kokkos::View<const Packet*, BufferDevice>& imports,
626  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
627  const LocalGraph& local_graph,
628  const LocalMap /*& local_col_map*/,
629  const int my_pid)
630 {
631  using Kokkos::View;
632  using Kokkos::subview;
633  using Kokkos::MemoryUnmanaged;
634  using Kokkos::parallel_reduce;
635  using Kokkos::atomic_fetch_add;
636 
637  typedef Packet packet_type;
638  typedef BufferDevice buffer_device_type;
639  typedef typename LocalMap::device_type device_type;
640  typedef typename LocalMap::local_ordinal_type LO;
641  typedef typename LocalMap::global_ordinal_type GO;
642  typedef typename device_type::execution_space execution_space;
643  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
644  typedef typename Kokkos::pair<size_type, size_type> slice;
645  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type> > range_policy;
646 
647  typedef View<int*,device_type, MemoryUnmanaged> pids_out_type;
648  typedef View<GO*, device_type, MemoryUnmanaged> gids_out_type;
649 
650  const size_type num_import_lids = import_lids.size();
651  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
652 
653  // RemoteIDs: Loop structure following UnpackAndCombine
654  int gbl_err_count;
655  parallel_reduce("Unpack and combine into CRS",
656  range_policy(0, num_import_lids),
657  KOKKOS_LAMBDA(const size_t i, int& err) {
658  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
659  const size_t num_packets_this_lid = num_packets_per_lid(i);
660  const size_t num_ent = num_packets_this_lid / 2;
661  const size_t offset = offsets(i);
662  const LO lcl_row = import_lids(i);
663  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
664  const size_t end_row = start_row + num_ent;
665 
666  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
667  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
668 
669  err += unpackRow<packet_type,GO,device_type,buffer_device_type>(
670  gids_out, pids_out, imports, offset, num_ent);
671 
672  // Correct target PIDs.
673  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
674  const int pid = pids_out(j);
675  pids_out(j) = (pid != my_pid) ? pid : -1;
676  }
677  }, gbl_err_count);
678 
679  TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
680  std::invalid_argument, prefix <<
681  "Attempting to unpack PIDs, but num_ent is not even; this should never "
682  "happen! Please report this bug to the Tpetra developers.");
683 
684  return;
685 }
686 
// Build the target CRS arrays (row offsets, global column indices, ranks)
// from three sources: rows copied unchanged, rows copied with permutation,
// and rows received in the import buffer.
//
// NOTE(review): the line carrying the function name is absent from this
// listing; the `prefix` string below names it
// `unpackAndCombineIntoCrsArrays`.
//
// Parameters:
//   local_graph         - source local graph.
//   local_col_map       - source local column map.
//   import_lids         - target local row of each received row.
//   imports             - packed receive buffer.
//   num_packets_per_lid - number of packets received per import LID.
//   permute_to_lids     - target local rows of the permuted rows.
//   permute_from_lids   - source local rows of the permuted rows.
//   tgt_rowptr          - [out] target row offsets; entries 0..tgt_num_rows
//                         are written.
//   tgt_colind          - [out] target global column indices.
//   src_pids            - rank of each source local column.
//   tgt_pids            - [out] target ranks (-1 stands for my_tgt_pid).
//   num_same_ids        - number of leading rows copied unchanged.
//   tgt_num_rows        - number of target rows.
//   tgt_num_nonzeros    - expected total number of target entries.
//   my_tgt_pid          - rank of the calling process in the target.
//
// Throws std::invalid_argument when the last row offset differs from
// tgt_num_nonzeros.  In HAVE_TPETRA_DEBUG builds it also throws
// std::logic_error when the last import offset differs from imports.extent(0).
687 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
688 void
690  const LocalGraph & local_graph,
691  const LocalMap & local_col_map,
692  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
693  typename LocalMap::device_type,
694  Kokkos::MemoryUnmanaged>& import_lids,
695  const Kokkos::View<const Packet*, BufferDevice>& imports,
696  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
697  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
698  typename LocalMap::device_type,
699  Kokkos::MemoryUnmanaged>& permute_to_lids,
700  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
701  typename LocalMap::device_type,
702  Kokkos::MemoryUnmanaged>& permute_from_lids,
703  const Kokkos::View<size_t*,
704  typename LocalMap::device_type,
705  Kokkos::MemoryUnmanaged>& tgt_rowptr,
706  const Kokkos::View<typename LocalMap::global_ordinal_type*,
707  typename LocalMap::device_type,
708  Kokkos::MemoryUnmanaged>& tgt_colind,
709  const Kokkos::View<const int*,
710  typename LocalMap::device_type,
711  Kokkos::MemoryUnmanaged>& src_pids,
712  const Kokkos::View<int*,
713  typename LocalMap::device_type,
714  Kokkos::MemoryUnmanaged>& tgt_pids,
715  const size_t num_same_ids,
716  const size_t tgt_num_rows,
717  const size_t tgt_num_nonzeros,
718  const int my_tgt_pid)
719 {
720  using Kokkos::View;
721  using Kokkos::subview;
722  using Kokkos::parallel_for;
723  using Kokkos::MemoryUnmanaged;
724  typedef Packet packet_type;
725  typedef LocalMap local_map_type;
726  typedef LocalGraph local_graph_type;
727  typedef BufferDevice buffer_device_type;
728  typedef typename LocalMap::device_type device_type;
729  typedef typename LocalMap::local_ordinal_type LO;
730  typedef typename device_type::execution_space execution_space;
731  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
732  typedef Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t> > range_policy;
733 
734  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
735 
736  const size_t N = tgt_num_rows;
737  const size_t mynnz = tgt_num_nonzeros;
738 
739  // In the case of reduced communicators, the sourceGraph won't have
740  // the right "my_pid", so thus we have to supply it.
741  const int my_pid = my_tgt_pid;
742 
// Step 1: tgt_rowptr first holds row LENGTHS; it becomes real offsets only
// after the scan further down.
743  // Zero the rowptr
744  parallel_for(
745  range_policy(0, N+1),
746  KOKKOS_LAMBDA(const size_t i) {
747  tgt_rowptr(i) = 0;
748  }
749  );
750 
751  // same IDs: Always first, always in the same place
752  parallel_for(
753  range_policy(0, num_same_ids),
754  KOKKOS_LAMBDA(const size_t i) {
755  const LO tgt_lid = static_cast<LO>(i);
756  const LO src_lid = static_cast<LO>(i);
757  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
758  - local_graph.row_map(src_lid);
759  }
760  );
761 
762  // Permute IDs: Still local, but reordered
763  const size_type num_permute_to_lids = permute_to_lids.extent(0);
764  parallel_for(
765  range_policy(0, num_permute_to_lids),
766  KOKKOS_LAMBDA(const size_t i) {
767  const LO tgt_lid = permute_to_lids(i);
768  const LO src_lid = permute_from_lids(i);
769  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
770  - local_graph.row_map(src_lid);
771  }
772  );
773 
774  // Get the offsets from the number of packets per LID
775  const size_type num_import_lids = import_lids.extent(0);
776  View<size_t*, device_type> offsets("offsets", num_import_lids+1);
777  computeOffsetsFromCounts(offsets, num_packets_per_lid);
778 
779 #ifdef HAVE_TPETRA_DEBUG
780  {
// Debug-only consistency check: the last offset must equal the buffer length.
781  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
782  const bool condition =
783  nth_offset_h != static_cast<size_t>(imports.extent(0));
784  TEUCHOS_TEST_FOR_EXCEPTION
785  (condition, std::logic_error, prefix
786  << "The final offset in bytes " << nth_offset_h
787  << " != imports.size() = " << imports.extent(0)
788  << ". Please report this bug to the Tpetra developers.");
789  }
790 #endif // HAVE_TPETRA_DEBUG
791 
// Step 2: add the lengths contributed by received rows.
792  // Setup row pointers for remotes
793  setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
794  tgt_rowptr, import_lids, imports, num_packets_per_lid);
795 
796  // If multiple processes contribute to the same row, we may need to
797  // update row offsets. This tracks that.
798  View<size_t*, device_type> new_start_row("new_start_row", N+1);
799 
// Step 3: prefix-sum the lengths; new_start_row starts as a copy of the
// resulting offsets and serves as the per-row write cursor afterwards.
800  // Turn row length into a real CRS row pointer
801  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
802  {
// The last offset must match the entry count the caller expects.
803  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
804  bool condition = nth_tgt_rowptr_h != mynnz;
805  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
806  prefix << "CRS_rowptr[last] = " <<
807  nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
808  }
809 
// Step 4: fill column indices and ranks, local sources first.
810  // SameIDs: Copy the data over
811  copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
812  tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
813 
814  copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
815  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
816  local_graph, local_col_map, my_pid);
817 
// Nothing was received, so there is nothing left to unpack.
818  if (imports.extent(0) <= 0) {
819  return;
820  }
821 
// Step 5: append the received entries after the locally copied ones.
822  unpackAndCombineIntoCrsArrays2<
823  packet_type,local_graph_type,local_map_type,buffer_device_type>(
824  tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
825  num_packets_per_lid, local_graph, local_col_map, my_pid);
826 
827  return;
828 }
829 
830 } // namespace UnpackAndCombineCrsGraphImpl
831 
// Host-facing entry point: wraps the Teuchos::ArrayView inputs in device
// views and forwards them to UnpackAndCombineCrsGraphImpl::unpackAndCombine.
//
// NOTE(review): the line carrying the function name is absent from this
// listing; presumably it is `unpackCrsGraphAndCombine`, going by the file
// name - confirm against the declaration.
//
// Parameters:
//   sourceGraph        - graph whose local graph and local column map are
//                        handed to the implementation.
//   imports            - packed receive buffer.
//   numPacketsPerLID   - number of packets received per import LID.
//   importLIDs         - local row indices being received.
//   constantNumPackets - not used in this function.
//   distor             - not used in this function.
//   combineMode        - forwarded unchanged.
//   atomic             - forwarded unchanged.
//
// PIDs are never unpacked through this path: `false` is passed for the
// implementation's unpack_pids argument.
868 template<class LO, class GO, class Node>
869 void
871  const CrsGraph<LO, GO, Node>& sourceGraph,
872  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,Node>::packet_type>& imports,
873  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
874  const Teuchos::ArrayView<const LO>& importLIDs,
875  size_t constantNumPackets,
876  Distributor & distor,
877  CombineMode combineMode,
878  const bool atomic)
879 {
880  using Kokkos::View;
881  typedef typename Node::device_type device_type;
882  typedef typename CrsGraph<LO,GO,Node>::packet_type packet_type;
883  typedef typename CrsGraph<LO, GO, Node>::local_graph_type local_graph_type;
884  typedef typename CrsGraph<LO, GO, Node>::buffer_device_type buffer_device_type;
885  static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
886  "Node::device_type and LocalGraph::device_type must be the same.");
887 
// Default-constructed device objects, used to select where the mirror views
// below are created.
888  typedef typename device_type::execution_space execution_space;
889  typename execution_space::device_type outputDevice;
890 
891  typedef typename buffer_device_type::execution_space buffer_execution_space;
892  typename buffer_execution_space::device_type bufferOutputDevice;
893 
894  // Convert all Teuchos::Array to Kokkos::View.
895 
896  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
897  // them to device. Since unpacking is done directly in to the local graph
898  // (lclGraph), no copying needs to be performed after unpacking.
899  auto imports_d =
900  create_mirror_view_from_raw_host_array(bufferOutputDevice,
901  imports.getRawPtr(), imports.size(),
902  true, "imports");
903 
904  auto num_packets_per_lid_d =
905  create_mirror_view_from_raw_host_array(bufferOutputDevice,
906  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
907  true, "num_packets_per_lid");
908 
// NOTE(review): the first line of this initializer is absent from the
// listing; presumably it is another create_mirror_view_from_raw_host_array
// call, taking outputDevice - confirm against the real source.
909  auto import_lids_d =
911  importLIDs.getRawPtr(), importLIDs.size(),
912  true, "import_lids");
913 
914  auto local_graph = sourceGraph.getLocalGraph();
915  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
916 
917  // Now do the actual unpack!
918  typedef decltype(local_col_map) local_map_type;
919  UnpackAndCombineCrsGraphImpl::unpackAndCombine<
920  packet_type,local_graph_type,local_map_type,buffer_device_type>(
921  local_graph, local_col_map, imports_d, num_packets_per_lid_d,
922  import_lids_d, combineMode, false, atomic);
923 
924  return;
925 }
926 
927 template<class LO, class GO, class Node>
928 void
929 unpackCrsGraphAndCombineNew(
930  const CrsGraph<LO, GO, Node>& sourceGraph,
931  const Kokkos::DualView<const typename CrsGraph<LO,GO,Node>::packet_type*,
932  typename CrsGraph<LO,GO,Node>::buffer_device_type>& imports,
933  const Kokkos::DualView<const size_t*,
934  typename CrsGraph<LO,GO,Node>::buffer_device_type>& numPacketsPerLID,
935  const Kokkos::DualView<const LO*, typename Node::device_type>& importLIDs,
936  const size_t constantNumPackets,
937  Distributor& distor,
938  const CombineMode combineMode,
939  const bool atomic)
940 {
942  using Kokkos::View;
943  typedef typename Node::device_type device_type;
944  typedef CrsGraph<LO, GO, Node> crs_graph_type;
945  typedef typename crs_graph_type::packet_type packet_type;
946  typedef typename crs_graph_type::local_graph_type local_graph_type;
947  typedef typename crs_graph_type::buffer_device_type buffer_device_type;
948  typedef typename buffer_device_type::memory_space buffer_memory_space;
949  typedef typename device_type::memory_space memory_space;
950 
951  static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
952  "Node::device_type and LocalGraph::device_type must be "
953  "the same.");
954 
955  {
956  auto numPacketsPerLID_nc = castAwayConstDualView(numPacketsPerLID);
957  numPacketsPerLID_nc.template sync<buffer_memory_space>();
958  }
959  auto num_packets_per_lid_d = numPacketsPerLID.template view<buffer_memory_space>();
960 
961  {
962  auto importLIDs_nc = castAwayConstDualView(importLIDs);
963  importLIDs_nc.template sync<memory_space>();
964  }
965  auto import_lids_d = importLIDs.template view<memory_space>();
966 
967  {
968  auto imports_nc = castAwayConstDualView(imports);
969  imports_nc.template sync<buffer_memory_space>();
970  }
971  auto imports_d = imports.template view<buffer_memory_space>();
972 
973  auto local_graph = sourceGraph.getLocalGraph();
974  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
975  typedef decltype(local_col_map) local_map_type;
976 
977  // Now do the actual unpack!
978  UnpackAndCombineCrsGraphImpl::unpackAndCombine<
979  packet_type,local_graph_type,local_map_type,buffer_device_type>(
980  local_graph, local_col_map, imports_d, num_packets_per_lid_d,
981  import_lids_d, combineMode, false, atomic);
982 }
983 
1027 //
1036 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1037 size_t
1039  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1040  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1041  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
1042  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1043  size_t constantNumPackets,
1044  Distributor &distor,
1045  CombineMode combineMode,
1046  size_t numSameIDs,
1047  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1048  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1049 {
1050  using Kokkos::MemoryUnmanaged;
1051  using Kokkos::View;
1052  typedef typename Node::device_type device_type;
1053  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type packet_type;
1054  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type local_graph_type;
1055  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type buffer_device_type;
1056  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1057 
1058  TEUCHOS_TEST_FOR_EXCEPTION
1059  (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1060  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
1061  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1062  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1063  // process, then the graph is neither locally nor globally indexed.
1064  const bool locallyIndexed = sourceGraph.isLocallyIndexed();
1065  TEUCHOS_TEST_FOR_EXCEPTION
1066  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1067  "CrsGraph 'sourceGraph' must be locally indexed.");
1068  TEUCHOS_TEST_FOR_EXCEPTION
1069  (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1070  prefix << "importLIDs.size() = " << importLIDs.size() << " != "
1071  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1072 
1073  auto local_graph = sourceGraph.getLocalGraph();
1074  auto permute_from_lids_d =
1076  permuteFromLIDs.getRawPtr(),
1077  permuteFromLIDs.size(), true,
1078  "permute_from_lids");
1079  auto imports_d =
1080  create_mirror_view_from_raw_host_array(buffer_device_type(),
1081  imports.getRawPtr(),
1082  imports.size(), true,
1083  "imports");
1084  auto num_packets_per_lid_d =
1085  create_mirror_view_from_raw_host_array(buffer_device_type(),
1086  numPacketsPerLID.getRawPtr(),
1087  numPacketsPerLID.size(), true,
1088  "num_packets_per_lid");
1089 
1091  packet_type,local_graph_type,buffer_device_type>(
1092  local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
1093 }
1094 
1108 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1109 void
1111  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1112  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1113  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
1114  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1115  const size_t constantNumPackets,
1116  Distributor& distor,
1117  const CombineMode combineMode,
1118  const size_t numSameIDs,
1119  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1120  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1121  size_t TargetNumRows,
1122  size_t TargetNumNonzeros,
1123  const int MyTargetPID,
1124  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1125  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1126  const Teuchos::ArrayView<const int>& SourcePids,
1127  Teuchos::Array<int>& TargetPids)
1128 {
1129  using Kokkos::View;
1130  using Kokkos::deep_copy;
1131  using Teuchos::ArrayView;
1132  using Teuchos::outArg;
1133  using Teuchos::REDUCE_MAX;
1134  using Teuchos::reduceAll;
1135  typedef LocalOrdinal LO;
1136  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type packet_type;
1137  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type local_graph_type;
1138  typedef typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type buffer_device_type;
1139  typedef typename Node::device_type device_type;
1140  typedef typename device_type::execution_space execution_space;
1141  typedef typename buffer_device_type::execution_space buffer_execution_space;
1142  typedef typename ArrayView<const LO>::size_type size_type;
1143 
1144  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1145 
1146  TEUCHOS_TEST_FOR_EXCEPTION(
1147  TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
1148  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1149  CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1150 
1151  TEUCHOS_TEST_FOR_EXCEPTION(
1152  permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1153  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
1154  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1155  const size_type numImportLIDs = importLIDs.size();
1156 
1157  TEUCHOS_TEST_FOR_EXCEPTION(
1158  numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
1159  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1160  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1161 
1162  // Preseed TargetPids with -1 for local
1163  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1164  TargetPids.resize(TargetNumNonzeros);
1165  }
1166  TargetPids.assign(TargetNumNonzeros, -1);
1167 
1168  // Grab pointers for sourceGraph
1169  auto local_graph = sourceGraph.getLocalGraph();
1170  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
1171 
1172  // Convert input arrays to Kokkos::View
1173  typename execution_space::device_type outputDevice;
1174  typename buffer_execution_space::device_type bufferOutputDevice;
1175 
1176  auto import_lids_d = create_mirror_view_from_raw_host_array(outputDevice,
1177  importLIDs.getRawPtr(), importLIDs.size(),
1178  true, "import_lids");
1179 
1180  auto imports_d = create_mirror_view_from_raw_host_array(bufferOutputDevice,
1181  imports.getRawPtr(), imports.size(),
1182  true, "imports");
1183 
1184  auto num_packets_per_lid_d = create_mirror_view_from_raw_host_array(bufferOutputDevice,
1185  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1186  true, "num_packets_per_lid");
1187 
1188  auto permute_from_lids_d = create_mirror_view_from_raw_host_array(outputDevice,
1189  permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1190  true, "permute_from_lids");
1191 
1192  auto permute_to_lids_d = create_mirror_view_from_raw_host_array(outputDevice,
1193  permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1194  true, "permute_to_lids");
1195 
1196  auto crs_rowptr_d = create_mirror_view_from_raw_host_array(outputDevice,
1197  CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1198  true, "crs_rowptr");
1199 
1200  auto crs_colind_d = create_mirror_view_from_raw_host_array(outputDevice,
1201  CRS_colind.getRawPtr(), CRS_colind.size(),
1202  true, "crs_colidx");
1203 
1204  auto src_pids_d = create_mirror_view_from_raw_host_array(outputDevice,
1205  SourcePids.getRawPtr(), SourcePids.size(),
1206  true, "src_pids");
1207 
1208  auto tgt_pids_d = create_mirror_view_from_raw_host_array(outputDevice,
1209  TargetPids.getRawPtr(), TargetPids.size(),
1210  true, "tgt_pids");
1211 
1212  typedef decltype(local_col_map) local_map_type;
1214  packet_type,local_graph_type,local_map_type,buffer_device_type>(
1215  local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1216  permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1217  tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
1218 
1219  // Copy outputs back to host
1220  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1221  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1222  deep_copy(crs_rowptr_h, crs_rowptr_d);
1223 
1224  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1225  CRS_colind.getRawPtr(), CRS_colind.size());
1226  deep_copy(crs_colind_h, crs_colind_d);
1227 
1228  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1229  TargetPids.getRawPtr(), TargetPids.size());
1230  deep_copy(tgt_pids_h, tgt_pids_d);
1231 
1232 }
1233 
1234 } // namespace Details
1235 } // namespace Tpetra
1236 
// Explicit-instantiation macro for the four function templates defined in
// this file, for one (LO, GO, NT) combination:
//   - Details::unpackCrsGraphAndCombine
//   - Details::unpackCrsGraphAndCombineNew
//   - Details::unpackAndCombineIntoCrsArrays
//   - Details::unpackAndCombineWithOwningPIDsCount
// The names are qualified with Details:: only, so the macro presumably has
// to be expanded inside namespace Tpetra.
#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
  template void \
  Details::unpackCrsGraphAndCombine<LO, GO, NT>( \
      const CrsGraph<LO, GO, NT>&, \
      const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
      const Teuchos::ArrayView<const size_t>&, \
      const Teuchos::ArrayView<const LO>&, \
      size_t, \
      Distributor&, \
      CombineMode, \
      const bool); \
  \
  template void \
  Details::unpackCrsGraphAndCombineNew<LO, GO, NT>( \
      const CrsGraph<LO, GO, NT>&, \
      const Kokkos::DualView<const typename CrsGraph<LO,GO,NT>::packet_type*, \
                             typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \
      const Kokkos::DualView<const size_t*, \
                             typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \
      const Kokkos::DualView<const LO*, NT::device_type>&, \
      const size_t, \
      Distributor&, \
      const CombineMode, \
      const bool); \
  \
  template void \
  Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
      const CrsGraph<LO, GO, NT> &, \
      const Teuchos::ArrayView<const LO>&, \
      const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
      const Teuchos::ArrayView<const size_t>&, \
      const size_t, \
      Distributor&, \
      const CombineMode, \
      const size_t, \
      const Teuchos::ArrayView<const LO>&, \
      const Teuchos::ArrayView<const LO>&, \
      size_t, \
      size_t, \
      const int, \
      const Teuchos::ArrayView<size_t>&, \
      const Teuchos::ArrayView<GO>&, \
      const Teuchos::ArrayView<const int>&, \
      Teuchos::Array<int>&); \
  \
  template size_t \
  Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
      const CrsGraph<LO, GO, NT> &, \
      const Teuchos::ArrayView<const LO> &, \
      const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
      const Teuchos::ArrayView<const size_t>&, \
      size_t, \
      Distributor &, \
      CombineMode, \
      size_t, \
      const Teuchos::ArrayView<const LO>&, \
      const Teuchos::ArrayView<const LO>&);
1290 
1291 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
Tpetra_Details_OrdinalTraits.hpp
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Tpetra::Details::computeOffsetsFromCounts
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Definition: Tpetra_Details_computeOffsets.hpp:284
Tpetra::Details::unpackAndCombineIntoCrsArrays
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Tpetra::REPLACE
Replace existing values with new values.
Definition: Tpetra_CombineMode.hpp:97
Tpetra::Details::UnpackAndCombineCrsGraphImpl::UnpackAndCombineFunctor
Unpacks and combines a single row of the CrsGraph.
Definition: Tpetra_Details_unpackCrsGraphAndCombine_def.hpp:140
Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackRow
KOKKOS_FUNCTION int unpackRow(typename Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, typename Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
Definition: Tpetra_Details_unpackCrsGraphAndCombine_def.hpp:103
Tpetra::Details::create_mirror_view_from_raw_host_array
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Definition: Tpetra_Details_createMirrorView.hpp:201
Tpetra_CrsGraph_decl.hpp
Declaration of the Tpetra::CrsGraph class.
Tpetra_Details_Behavior.hpp
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Tpetra::Details::UnpackAndCombineCrsGraphImpl::setupRowPointersForRemotes
void setupRowPointersForRemotes(const Kokkos::View< size_t *, Device > &tgt_rowptr, const Kokkos::View< const LO *, Device > &import_lids, const Kokkos::View< const Packet *, BufferDevice > &imports, const Kokkos::View< const size_t *, BufferDevice > &num_packets_per_lid)
Setup row pointers for remotes.
Definition: Tpetra_Details_unpackCrsGraphAndCombine_def.hpp:468
Tpetra::Details::unpackCrsGraphAndCombine
void unpackCrsGraphAndCombine(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, const bool atomic)
Unpack the imported column indices and combine into graph.
Tpetra::Classes::CrsGraph::getLocalGraph
local_graph_type getLocalGraph() const
Get the local graph.
Definition: Tpetra_CrsGraph_def.hpp:4766
Details
Implementation details of Tpetra.
Tpetra_Details_castAwayConstDualView.hpp
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpetra.
Tpetra::Distributor
Sets up and executes a communication plan for a Tpetra DistObject.
Definition: Tpetra_Distributor.hpp:188
Tpetra::Details::Classes::LocalMap
"Local" part of Map suitable for Kokkos kernels.
Definition: Tpetra_Details_LocalMap.hpp:72
Tpetra::Classes::CrsGraph::isLocallyIndexed
bool isLocallyIndexed() const override
If graph indices are in the local range, this function returns true. Otherwise, this function returns false.
Definition: Tpetra_CrsGraph_def.hpp:1147
Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine
void unpackAndCombine(const LocalGraph &local_graph, const LocalMap &local_map, const Kokkos::View< const Packet *, BufferDevice, Kokkos::MemoryUnmanaged > &imports, const Kokkos::View< const size_t *, BufferDevice, Kokkos::MemoryUnmanaged > &num_packets_per_lid, const Kokkos::View< const typename LocalMap::local_ordinal_type *, typename LocalMap::device_type, Kokkos::MemoryUnmanaged > &import_lids, const Tpetra::CombineMode combine_mode, const bool unpack_pids, const bool atomic)
Perform the unpack operation for the graph.
Definition: Tpetra_Details_unpackCrsGraphAndCombine_def.hpp:315
Tpetra::Details::Classes::LocalMap::getLocalElement
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index.
Definition: Tpetra_Details_LocalMap.hpp:153
Tpetra_Details_getEntryOnHost.hpp
Declaration and definition of Tpetra::Details::getEntryOnHost.
Tpetra::Details::unpackAndCombineIntoCrsArrays
void unpackAndCombineIntoCrsArrays(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GlobalOrdinal > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Definition: Tpetra_Details_unpackCrsGraphAndCombine_def.hpp:1110
Tpetra::Classes::DistObject< GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node >::packet_type
::Kokkos::Details::ArithTraits< GlobalOrdinal >::val_type packet_type
The type of each datum being sent or received in an Import or Export.
Definition: Tpetra_DistObject_decl.hpp:361
Tpetra_Details_computeOffsets.hpp
Declare and define the function Tpetra::Details::computeOffsetsFromCounts, an implementation detail of Tpetra.
Tpetra::Details::LocalMap
::Tpetra::Details::Classes::LocalMap< LocalOrdinal, GlobalOrdinal, DeviceType > LocalMap
Alias for Tpetra::Details::Classes::LocalMap.
Definition: Tpetra_Details_LocalMap_fwd.hpp:72
Tpetra::Classes::CrsGraph::local_graph_type
Kokkos::StaticCrsGraph< LocalOrdinal, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
Definition: Tpetra_CrsGraph_decl.hpp:292
Tpetra::Details::unpackAndCombineWithOwningPIDsCount
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
Tpetra
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Tpetra::deep_copy
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Definition: Tpetra_MultiVector_decl.hpp:2453
Tpetra::Details::castAwayConstDualView
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
Definition: Tpetra_Details_castAwayConstDualView.hpp:64
Tpetra::Classes::DistObject< GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node >::buffer_device_type
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Definition: Tpetra_DistObject_decl.hpp:710
Tpetra_Details_createMirrorView.hpp
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Tpetra::Classes::CrsGraph::getColMap
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
Definition: Tpetra_CrsGraph_def.hpp:913
Tpetra::Details::unpackAndCombineWithOwningPIDsCount
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
Definition: Tpetra_Details_unpackCrsGraphAndCombine_def.hpp:1038
Tpetra::INSERT
Insert new values that don't currently exist.
Definition: Tpetra_CombineMode.hpp:96
Tpetra::Classes::CrsGraph
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Definition: Tpetra_CrsGraph_decl.hpp:259
Tpetra::CombineMode
CombineMode
Rule for combining data in an Import or Export.
Definition: Tpetra_CombineMode.hpp:94