Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_packCrsGraph_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
43 #define TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
53 #include "Tpetra_CrsGraph_decl.hpp"
54 #include <memory>
55 #include <string>
56 
78 
79 namespace Tpetra {
80 
81 #ifndef DOXYGEN_SHOULD_SKIP_THIS
82 // Forward declaration of Distributor
83 class Distributor;
84 #endif // DOXYGEN_SHOULD_SKIP_THIS
85 
86 //
87 // Users must never rely on anything in the Details namespace.
88 //
89 namespace Details {
90 
91 namespace PackCrsGraphImpl {
99 template<class OutputOffsetsViewType,
100  class CountsViewType,
101  class InputOffsetsViewType,
102  class InputLocalRowIndicesViewType,
103  class InputLocalRowPidsViewType,
104  const bool debug =
105 #ifdef HAVE_TPETRA_DEBUG
106  true
107 #else
108  false
109 #endif // HAVE_TPETRA_DEBUG
110  >
112 public:
113  typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
114  typedef typename CountsViewType::non_const_value_type count_type;
115  typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
116  typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
117  typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
118  // output Views drive where execution happens.
119  typedef typename OutputOffsetsViewType::device_type device_type;
120  static_assert (std::is_same<typename CountsViewType::device_type::execution_space,
121  typename device_type::execution_space>::value,
122  "OutputOffsetsViewType and CountsViewType must have the same execution space.");
123  static_assert (Kokkos::Impl::is_view<OutputOffsetsViewType>::value,
124  "OutputOffsetsViewType must be a Kokkos::View.");
125  static_assert (std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
126  "OutputOffsetsViewType must be a nonconst Kokkos::View.");
127  static_assert (std::is_integral<output_offset_type>::value,
128  "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
129  static_assert (Kokkos::Impl::is_view<CountsViewType>::value,
130  "CountsViewType must be a Kokkos::View.");
131  static_assert (std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
132  "CountsViewType must be a nonconst Kokkos::View.");
133  static_assert (std::is_integral<count_type>::value,
134  "The type of each entry of CountsViewType must be a built-in integer type.");
135  static_assert (Kokkos::Impl::is_view<InputOffsetsViewType>::value,
136  "InputOffsetsViewType must be a Kokkos::View.");
137  static_assert (std::is_integral<input_offset_type>::value,
138  "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
139  static_assert (Kokkos::Impl::is_view<InputLocalRowIndicesViewType>::value,
140  "InputLocalRowIndicesViewType must be a Kokkos::View.");
141  static_assert (std::is_integral<local_row_index_type>::value,
142  "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
143 
144  NumPacketsAndOffsetsFunctor(const OutputOffsetsViewType& outputOffsets,
145  const CountsViewType& counts,
146  const InputOffsetsViewType& rowOffsets,
147  const InputLocalRowIndicesViewType& lclRowInds,
148  const InputLocalRowPidsViewType& lclRowPids) :
149  outputOffsets_ (outputOffsets),
150  counts_ (counts),
151  rowOffsets_ (rowOffsets),
152  lclRowInds_ (lclRowInds),
153  lclRowPids_ (lclRowPids),
154  error_ ("error") // don't forget this, or you'll get segfaults!
155  {
156  if (debug) {
157  const size_t numRowsToPack = static_cast<size_t> (lclRowInds_.extent (0));
158 
159  if (numRowsToPack != static_cast<size_t> (counts_.extent (0))) {
160  std::ostringstream os;
161  os << "lclRowInds.extent(0) = " << numRowsToPack
162  << " != counts.extent(0) = " << counts_.extent (0)
163  << ".";
164  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
165  }
166  if (static_cast<size_t> (numRowsToPack + 1) !=
167  static_cast<size_t> (outputOffsets_.extent (0))) {
168  std::ostringstream os;
169  os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
170  << " != outputOffsets.extent(0) = " << outputOffsets_.extent (0)
171  << ".";
172  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
173  }
174  }
175  }
176 
177  KOKKOS_INLINE_FUNCTION void
178  operator() (const local_row_index_type& curInd,
179  output_offset_type& update,
180  const bool final) const
181  {
182  if (debug) {
183  if (curInd < static_cast<local_row_index_type> (0)) {
184  error_ () = 1;
185  return;
186  }
187  }
188 
189  if (final) {
190  if (debug) {
191  if (curInd >= static_cast<local_row_index_type> (outputOffsets_.extent (0))) {
192  error_ () = 2;
193  return;
194  }
195  }
196  outputOffsets_(curInd) = update;
197  }
198 
199  if (curInd < static_cast<local_row_index_type> (counts_.extent (0))) {
200  const auto lclRow = lclRowInds_(curInd);
201  if (static_cast<size_t> (lclRow + 1) >= static_cast<size_t> (rowOffsets_.extent (0)) ||
202  static_cast<local_row_index_type> (lclRow) < static_cast<local_row_index_type> (0)) {
203  error_ () = 3;
204  return;
205  }
206  // count_type could differ from the type of each row offset.
207  // For example, row offsets might each be 64 bits, but if their
208  // difference always fits in 32 bits, we may then safely use a
209  // 32-bit count_type.
210  const count_type count =
211  static_cast<count_type> (rowOffsets_(lclRow+1) - rowOffsets_(lclRow));
212 
213  // We pack first the global column indices and then pids (if any),
214  // However, if the number of entries in the row is zero, we pack nothing.
215  const count_type numEntToPack = (count == 0)
216  ? static_cast<count_type>(0)
217  : count * (1 + (lclRowPids_.size() > 0 ? 1 : 0));
218 
219  if (final) {
220  counts_(curInd) = numEntToPack;
221  }
222  update += numEntToPack;
223  }
224  }
225 
226  // mfh 31 May 2017: Don't need init or join. If you have join, MUST
227  // have join both with and without volatile! Otherwise intrawarp
228  // joins are really slow on GPUs.
229 
231  int getError () const {
232  auto error_h = Kokkos::create_mirror_view (error_);
233  Kokkos::deep_copy (error_h, error_);
234  return error_h ();
235  }
236 
237 private:
238  OutputOffsetsViewType outputOffsets_;
239  CountsViewType counts_;
240  typename InputOffsetsViewType::const_type rowOffsets_;
241  typename InputLocalRowIndicesViewType::const_type lclRowInds_;
242  typename InputLocalRowPidsViewType::const_type lclRowPids_;
243  Kokkos::View<int, device_type> error_;
244 };
245 
255 template<class OutputOffsetsViewType,
256  class CountsViewType,
257  class InputOffsetsViewType,
258  class InputLocalRowIndicesViewType,
259  class InputLocalRowPidsViewType>
260 typename CountsViewType::non_const_value_type
261 computeNumPacketsAndOffsets(const OutputOffsetsViewType& outputOffsets,
262  const CountsViewType& counts,
263  const InputOffsetsViewType& rowOffsets,
264  const InputLocalRowIndicesViewType& lclRowInds,
265  const InputLocalRowPidsViewType& lclRowPids)
266 {
267  typedef NumPacketsAndOffsetsFunctor<OutputOffsetsViewType,
268  CountsViewType, typename InputOffsetsViewType::const_type,
269  typename InputLocalRowIndicesViewType::const_type,
270  typename InputLocalRowPidsViewType::const_type> functor_type;
271  typedef typename CountsViewType::non_const_value_type count_type;
272  typedef typename OutputOffsetsViewType::size_type size_type;
273  typedef typename OutputOffsetsViewType::execution_space execution_space;
274  typedef typename functor_type::local_row_index_type LO;
275  typedef Kokkos::RangePolicy<execution_space, LO> range_type;
276  const char prefix[] = "computeNumPacketsAndOffsets: ";
277 
278  count_type count = 0;
279  const count_type numRowsToPack = lclRowInds.extent (0);
280 
281  if (numRowsToPack == 0) {
282  return count;
283  }
284  else {
285  TEUCHOS_TEST_FOR_EXCEPTION
286  (rowOffsets.extent (0) <= static_cast<size_type> (1),
287  std::invalid_argument, prefix << "There is at least one row to pack, "
288  "but the graph has no rows. lclRowInds.extent(0) = " <<
289  numRowsToPack << ", but rowOffsets.extent(0) = " <<
290  rowOffsets.extent (0) << " <= 1.");
291  TEUCHOS_TEST_FOR_EXCEPTION
292  (outputOffsets.extent (0) !=
293  static_cast<size_type> (numRowsToPack + 1), std::invalid_argument,
294  prefix << "Output dimension does not match number of rows to pack. "
295  << "outputOffsets.extent(0) = " << outputOffsets.extent (0)
296  << " != lclRowInds.extent(0) + 1 = "
297  << static_cast<size_type> (numRowsToPack + 1) << ".");
298  TEUCHOS_TEST_FOR_EXCEPTION
299  (counts.extent (0) != numRowsToPack, std::invalid_argument,
300  prefix << "counts.extent(0) = " << counts.extent (0)
301  << " != numRowsToPack = " << numRowsToPack << ".");
302 
303  functor_type f (outputOffsets, counts, rowOffsets, lclRowInds, lclRowPids);
304  Kokkos::parallel_scan (range_type (0, numRowsToPack + 1), f);
305 
306  // At least in debug mode, this functor checks for errors.
307  const int errCode = f.getError ();
308  TEUCHOS_TEST_FOR_EXCEPTION
309  (errCode != 0, std::runtime_error, prefix << "parallel_scan error code "
310  << errCode << " != 0.");
311 
312 #if 0
313  size_t total = 0;
314  for (LO k = 0; k < numRowsToPack; ++k) {
315  total += counts[k];
316  }
317  if (outputOffsets(numRowsToPack) != total) {
318  if (errStr.get () == NULL) {
319  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
320  }
321  std::ostringstream& os = *errStr;
322  os << prefix
323  << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
324  << outputOffsets(numRowsToPack) << " != sum of counts = "
325  << total << "." << std::endl;
326  if (numRowsToPack != 0) {
327  // Only print the array if it's not too long.
328  if (numRowsToPack < static_cast<LO> (10)) {
329  os << "outputOffsets: [";
330  for (LO i = 0; i <= numRowsToPack; ++i) {
331  os << outputOffsets(i);
332  if (static_cast<LO> (i + 1) <= numRowsToPack) {
333  os << ",";
334  }
335  }
336  os << "]" << std::endl;
337  os << "counts: [";
338  for (LO i = 0; i < numRowsToPack; ++i) {
339  os << counts(i);
340  if (static_cast<LO> (i + 1) < numRowsToPack) {
341  os << ",";
342  }
343  }
344  os << "]" << std::endl;
345  }
346  else {
347  os << "outputOffsets(" << (numRowsToPack-1) << ") = "
348  << outputOffsets(numRowsToPack-1) << "." << std::endl;
349  }
350  }
351  count = outputOffsets(numRowsToPack);
352  return {false, errStr};
353  }
354 #endif // HAVE_TPETRA_DEBUG
355 
356  // Get last entry of outputOffsets, which is the sum of the entries
357  // of counts. Don't assume UVM.
358  using Tpetra::Details::getEntryOnHost;
359  return static_cast<count_type> (getEntryOnHost (outputOffsets,
360  numRowsToPack));
361  }
362 }
363 
// Pack a single graph row into the export buffer.
//
// For each of the row's num_ent entries, converts the local column
// index lids_in[k] to a global index via col_map.getGlobalElement and
// stores it in exports(offset + k).  If pack_pids is true, also stores
// pids_in[lids_in[k]] (cast to GO) in exports(offset + num_ent + k).
//
// Returns the number of buffer entries written: 0 for an empty row,
// num_ent when pack_pids is false, 2*num_ent when it is true.
//
// No bounds checking is done here.
//
// NOTE(review): lids_in and pids_in are used in the body, but their
// parameter declarations are not visible in this listing (the embedded
// numbering jumps from 378 to 381) -- confirm against the original
// header before editing the signature.
374 template<class Packet, class ColumnMap, class BufferDeviceType>
375 KOKKOS_FUNCTION
376 size_t
377 packRow(const ColumnMap& col_map,
378  const Kokkos::View<Packet*, BufferDeviceType>& exports,
381  const size_t offset,
382  const size_t num_ent,
383  const bool pack_pids)
384 {
385  using Kokkos::subview;
386  typedef typename ColumnMap::local_ordinal_type LO;
387  typedef typename ColumnMap::global_ordinal_type GO;
388 
389  if (num_ent == 0) {
390  // Empty rows always take zero bytes, to ensure sparsity.
391  return static_cast<size_t>(0);
392  }
393 
  // Total number of buffer entries this row occupies: one per column
  // index, plus one per PID if PIDs are packed.
394  size_t num_ent_packed = num_ent;
395  if (pack_pids) num_ent_packed += num_ent;
396  {
397  // Copy column indices one at a time, so that we don't need
398  // temporary storage.
399  for (size_t k = 0; k < num_ent; ++k) {
400  const LO lid = lids_in[k];
401  const GO gid = col_map.getGlobalElement (lid);
402  exports(offset+k) = gid;
403  }
404  // Copy PIDs one at a time, so that we don't need temporary storage.
  // The PID array is indexed by the entry's local column index, and
  // the PIDs are stored after all of the row's column indices.
405  if (pack_pids) {
406  for (size_t k = 0; k < num_ent; ++k) {
407  const LO lid = lids_in[k];
408  const int pid = pids_in[lid];
409  exports(offset+num_ent+k) = static_cast<GO>(pid);
410  }
411  }
412  }
413  return num_ent_packed;
414 }
415 
416 template<class Packet, class LocalGraph, class LocalMap, class BufferDeviceType>
417 struct PackCrsGraphFunctor {
418  typedef LocalGraph local_graph_type;
419  typedef LocalMap local_map_type;
420  typedef typename local_map_type::local_ordinal_type LO;
421  typedef typename local_map_type::global_ordinal_type GO;
422  typedef typename local_graph_type::device_type device_type;
423 
424  typedef Kokkos::View<const size_t*, BufferDeviceType>
425  num_packets_per_lid_view_type;
426  typedef Kokkos::View<const size_t*, BufferDeviceType> offsets_view_type;
427  typedef Kokkos::View<Packet*, BufferDeviceType> exports_view_type;
429  export_lids_view_type;
431  source_pids_view_type;
432 
433  typedef typename num_packets_per_lid_view_type::non_const_value_type
434  count_type;
435  typedef typename offsets_view_type::non_const_value_type
436  offset_type;
437  typedef Kokkos::pair<int, LO> value_type;
438 
439  static_assert (std::is_same<LO, typename local_graph_type::data_type>::value,
440  "local_map_type::local_ordinal_type and "
441  "local_graph_type::data_type must be the same.");
442 
443  local_graph_type local_graph;
444  local_map_type local_col_map;
445  exports_view_type exports;
446  num_packets_per_lid_view_type num_packets_per_lid;
447  export_lids_view_type export_lids;
448  source_pids_view_type source_pids;
449  offsets_view_type offsets;
450  bool pack_pids;
451 
452  PackCrsGraphFunctor(const local_graph_type& local_graph_in,
453  const local_map_type& local_col_map_in,
454  const exports_view_type& exports_in,
455  const num_packets_per_lid_view_type& num_packets_per_lid_in,
456  const export_lids_view_type& export_lids_in,
457  const source_pids_view_type& source_pids_in,
458  const offsets_view_type& offsets_in,
459  const bool pack_pids_in) :
460  local_graph (local_graph_in),
461  local_col_map (local_col_map_in),
462  exports (exports_in),
463  num_packets_per_lid (num_packets_per_lid_in),
464  export_lids (export_lids_in),
465  source_pids (source_pids_in),
466  offsets (offsets_in),
467  pack_pids (pack_pids_in)
468  {
469  const LO numRows = local_graph_in.numRows ();
470  const LO rowMapDim =
471  static_cast<LO> (local_graph.row_map.extent (0));
472  TEUCHOS_TEST_FOR_EXCEPTION
473  (numRows != 0 && rowMapDim != numRows + static_cast<LO> (1),
474  std::logic_error, "local_graph.row_map.extent(0) = "
475  << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
476  }
477 
478  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const
479  {
480  using ::Tpetra::Details::OrdinalTraits;
481  dst = Kokkos::make_pair (0, OrdinalTraits<LO>::invalid ());
482  }
483 
484  KOKKOS_INLINE_FUNCTION void
485  join (volatile value_type& dst, const volatile value_type& src) const
486  {
487  // `dst` should reflect the first (least) bad index and all other
488  // associated error codes and data, so prefer keeping it.
489  if (src.first != 0 && dst.first == 0) {
490  dst = src;
491  }
492  }
493 
494  KOKKOS_INLINE_FUNCTION
495  void operator() (const LO i, value_type& dst) const
496  {
497  const size_t offset = offsets[i];
498  const LO export_lid = export_lids[i];
499  const size_t buf_size = exports.size();
500  const size_t num_packets_this_lid = num_packets_per_lid(i);
501  const size_t num_ent =
502  static_cast<size_t> (local_graph.row_map[export_lid+1]
503  - local_graph.row_map[export_lid]);
504 
505  // Only pack this row's data if it has a nonzero number of
506  // entries. We can do this because receiving processes get the
507  // number of packets, and will know that zero packets means zero
508  // entries.
509  if (num_ent == 0) {
510  return;
511  }
512 
513  if (export_lid >= static_cast<LO>(local_graph.numRows())) {
514  if (dst.first != 0) { // keep only the first error
515  dst = Kokkos::make_pair (1, i); // invalid row
516  }
517  return;
518  }
519  else if ((offset > buf_size || offset + num_packets_this_lid > buf_size)) {
520  if (dst.first != 0) { // keep only the first error
521  dst = Kokkos::make_pair (2, i); // out of bounds
522  }
523  return;
524  }
525 
526  // We can now pack this row
527 
528  // Since the graph is locally indexed on the calling process, we
529  // have to use its column Map (which it _must_ have in this case)
530  // to convert to global indices.
531  const auto row_beg = local_graph.row_map[export_lid];
532  const auto row_end = local_graph.row_map[export_lid + 1];
533  auto lids_in = subview (local_graph.entries,
534  Kokkos::make_pair (row_beg, row_end));
535  typedef local_map_type LMT;
536  typedef Packet PT;
537  typedef BufferDeviceType BDT;
538  size_t num_ent_packed_this_row =
539  packRow<PT,LMT,BDT>(local_col_map, exports, lids_in,
540  source_pids, offset, num_ent, pack_pids);
541  if (num_ent_packed_this_row != num_packets_this_lid) {
542  if (dst.first != 0) { // keep only the first error
543  dst = Kokkos::make_pair (3, i);
544  }
545  }
546  }
547 };
548 
556 template<class Packet, class LocalGraph, class LocalMap, class BufferDeviceType>
557 void
558 do_pack(const LocalGraph& local_graph,
559  const LocalMap& local_map,
560  const Kokkos::View<Packet*, BufferDeviceType>& exports,
561  const typename PackTraits<
562  size_t,
563  BufferDeviceType
564  >::input_array_type& num_packets_per_lid,
565  const typename PackTraits<
566  typename LocalMap::local_ordinal_type,
567  typename LocalGraph::device_type
568  >::input_array_type& export_lids,
569  const typename PackTraits<
570  int,
571  typename LocalGraph::device_type
572  >::input_array_type& source_pids,
573  const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
574  const bool pack_pids)
575 {
576  typedef typename LocalMap::local_ordinal_type LO;
577  typedef typename LocalGraph::device_type device_type;
578  typedef Kokkos::RangePolicy<typename device_type::execution_space, LO> range_type;
579  const char prefix[] = "Tpetra::Details::PackCrsGraphImpl::do_pack: ";
580 
581  if (export_lids.extent (0) != 0) {
582  TEUCHOS_TEST_FOR_EXCEPTION
583  (static_cast<size_t> (offsets.extent (0)) !=
584  static_cast<size_t> (export_lids.extent (0) + 1),
585  std::invalid_argument, prefix << "offsets.extent(0) = "
586  << offsets.extent (0) << " != export_lids.extent(0) (= "
587  << export_lids.extent (0) << ") + 1.");
588  TEUCHOS_TEST_FOR_EXCEPTION
589  (export_lids.extent (0) != num_packets_per_lid.extent (0),
590  std::invalid_argument, prefix << "export_lids.extent(0) = " <<
591  export_lids.extent (0) << " != num_packets_per_lid.extent(0) = "
592  << num_packets_per_lid.extent (0) << ".");
593  // If exports has nonzero length at this point, then the graph
594  // has at least one entry to pack. Thus, if packing process
595  // ranks, we had better have at least one process rank to pack.
596  TEUCHOS_TEST_FOR_EXCEPTION
597  (pack_pids && exports.extent (0) != 0 &&
598  source_pids.extent (0) == 0, std::invalid_argument, prefix <<
599  "pack_pids is true, and exports.extent(0) = " <<
600  exports.extent (0) << " != 0, meaning that we need to pack at "
601  "least one graph entry, but source_pids.extent(0) = 0.");
602  }
603 
604  typedef PackCrsGraphFunctor<Packet,LocalGraph,LocalMap,BufferDeviceType> pack_functor_type;
605  pack_functor_type f (local_graph, local_map, exports,
606  num_packets_per_lid, export_lids,
607  source_pids, offsets, pack_pids);
608 
609  typename pack_functor_type::value_type result;
610  range_type range (0, num_packets_per_lid.extent (0));
611  Kokkos::parallel_reduce (range, f, result);
612 
613  if (result.first != 0) {
614  std::ostringstream os;
615 
616  if (result.first == 1) { // invalid local row index
617  auto export_lids_h = Kokkos::create_mirror_view (export_lids);
618  Kokkos::deep_copy (export_lids_h, export_lids);
619  const auto firstBadLid = export_lids_h(result.second);
620  os << "First bad export LID: export_lids(i=" << result.second << ") = "
621  << firstBadLid;
622  }
623  else if (result.first == 2) { // invalid offset
624  auto offsets_h = Kokkos::create_mirror_view (offsets);
625  Kokkos::deep_copy (offsets_h, offsets);
626  const auto firstBadOffset = offsets_h(result.second);
627 
628  auto num_packets_per_lid_h =
629  Kokkos::create_mirror_view (num_packets_per_lid);
630  Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid);
631  os << "First bad offset: offsets(i=" << result.second << ") = "
632  << firstBadOffset << ", num_packets_per_lid(i) = "
633  << num_packets_per_lid_h(result.second) << ", buf_size = "
634  << exports.size ();
635  }
636 
637  TEUCHOS_TEST_FOR_EXCEPTION
638  (true, std::runtime_error, prefix << "PackCrsGraphFunctor reported "
639  "error code " << result.first << " for the first bad row "
640  << result.second << ". " << os.str ());
641  }
642 }
643 
670 template<typename LO, typename GO, typename NT>
671 void
672 packCrsGraph(const CrsGraph<LO,GO,NT>& sourceGraph,
673  Kokkos::DualView<typename CrsGraph<LO,GO,NT>::packet_type*,
674  typename CrsGraph<LO,GO,NT>::buffer_device_type>& exports,
675  const Kokkos::View<size_t*,
676  typename CrsGraph<LO,GO,NT>::buffer_device_type>& num_packets_per_lid,
677  const Kokkos::View<const LO*, typename NT::device_type>& export_lids,
678  const Kokkos::View<const int*, typename NT::device_type>& export_pids,
679  size_t& constant_num_packets,
680  const bool pack_pids,
681  Distributor& /* dist */)
682 {
683  using Kokkos::View;
684  typedef typename CrsGraph<LO,GO,NT>::packet_type packet_type;
685  typedef typename CrsGraph<LO,GO,NT>::buffer_device_type buffer_device_type;
686  typedef typename buffer_device_type::execution_space execution_space;
687  typedef Kokkos::DualView<packet_type*,buffer_device_type> exports_view_type;
688  const char prefix[] = "Tpetra::Details::packCrsGraph: ";
689  constexpr bool debug = false;
690 
691  auto local_graph = sourceGraph.getLocalGraph ();
692  auto local_col_map = sourceGraph.getColMap ()->getLocalMap ();
693 
694  // Setting this to zero tells the caller to expect a possibly
695  // different ("nonconstant") number of packets per local index
696  // (i.e., a possibly different number of entries per row).
697  constant_num_packets = 0;
698 
699  const size_t num_export_lids =
700  static_cast<size_t> (export_lids.extent (0));
701  TEUCHOS_TEST_FOR_EXCEPTION
702  (num_export_lids !=
703  static_cast<size_t> (num_packets_per_lid.extent (0)),
704  std::invalid_argument, prefix << "num_export_lids.extent(0) = "
705  << num_export_lids << " != num_packets_per_lid.extent(0) = "
706  << num_packets_per_lid.extent (0) << ".");
707  if (num_export_lids != 0) {
708  TEUCHOS_TEST_FOR_EXCEPTION
709  (num_packets_per_lid.data () == NULL, std::invalid_argument,
710  prefix << "num_export_lids = "<< num_export_lids << " != 0, but "
711  "num_packets_per_lid.data() = "
712  << num_packets_per_lid.data () << " == NULL.");
713  }
714 
715  if (num_export_lids == 0) {
716  // FIXME (26 Apr 2016) Fences around (UVM) allocations only
717  // temporarily needed for #227 debugging. Should be able to
718  // remove them after that's fixed.
719  execution_space::fence ();
720  exports = exports_view_type ("exports", 0);
721  execution_space::fence ();
722  return;
723  }
724 
725  // Array of offsets into the pack buffer.
726  Kokkos::View<size_t*,buffer_device_type> offsets ("offsets", num_export_lids + 1);
727 
728  // Compute number of packets per LID (row to send), as well as
729  // corresponding offsets (the prefix sum of the packet counts).
730  const size_t count =
731  computeNumPacketsAndOffsets(offsets, num_packets_per_lid,
732  local_graph.row_map, export_lids, export_pids);
733 
734  // Resize the output pack buffer if needed.
735  if (count > static_cast<size_t> (exports.extent (0))) {
736  // FIXME (26 Apr 2016) Fences around (UVM) allocations only
737  // temporarily needed for #227 debugging. Should be able to
738  // remove them after that's fixed.
739  execution_space::fence ();
740  exports = exports_view_type ("exports", count);
741  if (debug) {
742  std::ostringstream os;
743  os << "*** exports resized to " << count << std::endl;
744  std::cerr << os.str ();
745  }
746  execution_space::fence ();
747  }
748  if (debug) {
749  std::ostringstream os;
750  os << "*** count: " << count << ", exports.extent(0): "
751  << exports.extent (0) << std::endl;
752  std::cerr << os.str ();
753  }
754 
755  // If exports has nonzero length at this point, then the graph has
756  // at least one entry to pack. Thus, if packing process ranks, we
757  // had better have at least one process rank to pack.
758  TEUCHOS_TEST_FOR_EXCEPTION
759  (pack_pids && exports.extent (0) != 0 &&
760  export_pids.extent (0) == 0, std::invalid_argument, prefix <<
761  "pack_pids is true, and exports.extent(0) = " <<
762  exports.extent (0) << " != 0, meaning that we need to pack at least "
763  "one graph entry, but export_pids.extent(0) = 0.");
764 
765  typedef typename std::decay<decltype (local_graph)>::type
766  local_graph_type;
767  typedef typename std::decay<decltype (local_col_map)>::type
768  local_map_type;
769  typedef typename exports_view_type::t_dev dev_exports_view_type;
770  typedef typename dev_exports_view_type::memory_space buf_mem_space;
771  exports.template modify<buf_mem_space> ();
772  auto exports_d = exports.template view<buf_mem_space> ();
773  do_pack<packet_type,local_graph_type,local_map_type,buffer_device_type>
774  (local_graph, local_col_map, exports_d, num_packets_per_lid,
775  export_lids, export_pids, offsets, pack_pids);
776  // If we got this far, we succeeded.
777 }
778 
779 } // namespace PackCrsGraphImpl
780 
// Front end of PackCrsGraphImpl::packCrsGraph that takes Teuchos
// arrays.
//
// Packs the rows listed in exportLIDs without process ranks
// (pack_pids is false and an empty PID View is passed), then copies
// the per-row packet counts into numPacketsPerLID and the packed data
// into exports, resizing exports first if its size differs from the
// pack buffer's.
//
// NOTE(review): the line carrying the function name and its first
// parameter (embedded number 783) is not visible in this listing;
// sourceGraph, used below, is presumably declared there -- confirm
// against the original header.
781 template<typename LO, typename GO, typename NT>
782 void
784  Teuchos::Array<typename CrsGraph<LO,GO,NT>::packet_type>& exports,
785  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
786  const Teuchos::ArrayView<const LO>& exportLIDs,
787  size_t& constantNumPackets,
788  Distributor& distor)
789 {
790  typedef typename CrsGraph<LO,GO,NT>::packet_type packet_type;
791  typedef typename CrsGraph<LO,GO,NT>::local_graph_type local_graph_type;
792  typedef typename local_graph_type::device_type device_type;
793  typedef typename Kokkos::View<size_t*, device_type>::HostMirror::execution_space host_exec_space;
794  typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
795 
796  // mfh 23 Aug 2017: Fix for #1088 requires pack / unpack buffers to
797  // have a possibly different memory space (CudaSpace) than the
798  // default CUDA memory space (currently CudaUVMSpace).
799  typedef typename device_type::execution_space buffer_exec_space;
800 #ifdef KOKKOS_ENABLE_CUDA
  // Use CudaSpace for the buffer when executing on Kokkos::Cuda;
  // otherwise use the device's own memory space.
801  typedef typename std::conditional<
802  std::is_same<
803  buffer_exec_space, Kokkos::Cuda
804  >::value,
805  Kokkos::CudaSpace,
806  typename device_type::memory_space
807  >::type buffer_memory_space;
808 #else
809  typedef typename device_type::memory_space buffer_memory_space;
810 #endif // KOKKOS_ENABLE_CUDA
811  // @MFH: why not use CrsGraph<LO,GO,NT>::buffer_device_type???
812  typedef Kokkos::Device<buffer_exec_space,
813  buffer_memory_space> buffer_device_type;
814 
815  // Convert all Teuchos::Array to Kokkos::View
816 
817  // This is an output array, so we don't have to copy to device here.
818  // However, we'll have to remember to copy back to host when done.
819  typename local_graph_type::device_type outputDevice;
  // NOTE(review): the first line of this initializer (embedded number
  // 821) is not visible in this listing.
820  auto num_packets_per_lid_d =
822  numPacketsPerLID.getRawPtr (),
823  numPacketsPerLID.size (), false,
824  "num_packets_per_lid");
825  // This is an input array, so we have to copy to device here.
826  // However, we never need to copy it back to host.
  // NOTE(review): the first line of this initializer (embedded number
  // 828) is not visible in this listing.
827  auto export_lids_d =
829  exportLIDs.getRawPtr (),
830  exportLIDs.size (), true,
831  "export_lids");
832  // Create an empty array of PIDs
833  Kokkos::View<int*, device_type> export_pids_d ("export_pids", 0);
834 
  // Start from an empty pack buffer; the implementation is passed the
  // buffer by name and is expected to size it.
835  Kokkos::DualView<packet_type*,buffer_device_type> exports_dv ("exports", 0);
836  constexpr bool pack_pids = false;
837  PackCrsGraphImpl::packCrsGraph<LO,GO,NT>(
838  sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
839  export_pids_d, constantNumPackets, pack_pids, distor);
840  // The counts are an output of packCrsGraph, so we have to copy
841  // them back to host.
  // The host View below wraps the caller's numPacketsPerLID storage,
  // so the deep_copy writes directly into the caller's array.
842  Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h
843  (numPacketsPerLID.getRawPtr (),
844  numPacketsPerLID.size ());
845  Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid_d);
846 
847  // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
848  // exports_dv above, then we have two host copies for exports_h.
849 
850  // The exports are an output of packCrsGraph, so we have to
851  // copy them back to host.
852  if (static_cast<size_t> (exports.size ()) !=
853  static_cast<size_t> (exports_dv.extent (0))) {
854  exports.resize (exports_dv.extent (0));
855  }
  // Wrap the (possibly resized) caller array in a host View and copy
  // the device side of the pack buffer into it.
856  Kokkos::View<packet_type*, host_dev_type> exports_h (exports.getRawPtr (),
857  exports.size ());
858  Kokkos::deep_copy (exports_h, exports_dv.d_view);
859 }
860 
// Front end of PackCrsGraphImpl::packCrsGraph that takes
// Kokkos::DualView arguments.
//
// Packs the rows listed in exportLIDs without process ranks
// (pack_pids is false and an empty PID View is passed).  The packet
// counts are written through the buffer-memory-space View of
// numPacketsPerLID, and the row indices are read from the device View
// of exportLIDs after syncing it to device.
//
// NOTE(review): the line carrying the function name and its first
// parameter (embedded number 863) is not visible in this listing;
// sourceGraph, used below, is presumably declared there -- confirm
// against the original header.
861 template<typename LO, typename GO, typename NT>
862 void
864  Kokkos::DualView<typename CrsGraph<LO,GO,NT>::packet_type*,
865  typename CrsGraph<LO,GO,NT>::buffer_device_type>& exports,
866  const Kokkos::DualView<size_t*,
867  typename CrsGraph<LO,GO,NT>::buffer_device_type>& numPacketsPerLID,
868  const Kokkos::DualView<const LO*, typename NT::device_type>& exportLIDs,
869  size_t& constantNumPackets,
870  Distributor& distor)
871 {
872  typedef typename CrsGraph<LO,GO,NT>::local_graph_type local_graph_type;
873  typedef typename local_graph_type::device_type device_type;
874 
875  // mfh 23 Aug 2017: Fix for #1088 requires pack / unpack buffers to
876  // have a possibly different memory space (CudaSpace) than the
877  // default CUDA memory space (currently CudaUVMSpace).
878 #ifdef KOKKOS_ENABLE_CUDA
  // Use CudaSpace for the buffer when executing on Kokkos::Cuda;
  // otherwise use the device's own memory space.
879  typedef typename device_type::execution_space buffer_exec_space;
880  typedef typename std::conditional<
881  std::is_same<
882  buffer_exec_space, Kokkos::Cuda
883  >::value,
884  Kokkos::CudaSpace,
885  typename device_type::memory_space
886  >::type buffer_memory_space;
887 #else
888  typedef typename device_type::memory_space buffer_memory_space;
889 #endif // KOKKOS_ENABLE_CUDA
890 
891  // Create an empty array of PIDs, since the interface needs it.
892  Kokkos::View<int*, device_type> exportPIDs_d ("exportPIDs", 0);
893  constexpr bool pack_pids = false;
894 
895  // Write-only device access
  // The modify flags are set on a non-const copy of the DualView,
  // since the argument itself is const: host flag cleared, device
  // flag set.
  // NOTE(review): presumably the copy shares its flags with the
  // caller's DualView -- confirm against the Kokkos::DualView
  // documentation.
896  auto numPacketsPerLID_nc = numPacketsPerLID; // const DV& -> DV
897  numPacketsPerLID_nc.modified_host() = 0;
898  numPacketsPerLID_nc.modified_device() = 1;
899  auto numPacketsPerLID_d = numPacketsPerLID.template view<buffer_memory_space> ();
900 
901  // Read-only device access
  // Syncing requires a non-const DualView, hence the const cast.
902  auto exportLIDs_nc = Tpetra::Details::castAwayConstDualView (exportLIDs);
903  exportLIDs_nc.template sync<typename device_type::memory_space> ();
904  auto exportLIDs_d = exportLIDs.template view<typename device_type::memory_space> ();
905 
906  PackCrsGraphImpl::packCrsGraph<LO,GO,NT>(
907  sourceGraph, exports, numPacketsPerLID_d, exportLIDs_d,
908  exportPIDs_d, constantNumPackets, pack_pids, distor);
909 }
910 
911 template<typename LO, typename GO, typename NT>
912 void
914  Kokkos::DualView<typename CrsGraph<LO,GO,NT>::packet_type*,
916  exports_dv,
917  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
918  const Teuchos::ArrayView<const LO>& exportLIDs,
919  const Teuchos::ArrayView<const int>& sourcePIDs,
920  size_t& constantNumPackets,
921  Distributor& distor)
922 {
923  typedef typename CrsGraph<LO,GO,NT>::local_graph_type local_graph_type;
924  typedef typename CrsGraph<LO,GO,NT>::packet_type packet_type;
925  typedef typename CrsGraph<LO,GO,NT>::buffer_device_type buffer_device_type;
926  typedef typename Kokkos::DualView<packet_type*, buffer_device_type>::t_host::execution_space host_exec_space;
927  typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
928 
929  typename local_graph_type::device_type outputDevice;
930 
931  // Convert all Teuchos::Array to Kokkos::View
932 
933  // This is an output array, so we don't have to copy to device here.
934  // However, we'll have to remember to copy back to host when done.
935  auto num_packets_per_lid_d =
936  create_mirror_view_from_raw_host_array (buffer_device_type (),
937  numPacketsPerLID.getRawPtr (),
938  numPacketsPerLID.size (), false,
939  "num_packets_per_lid");
940 
941  // This is an input array, so we have to copy to device here.
942  // However, we never need to copy it back to host.
943  auto export_lids_d =
945  exportLIDs.getRawPtr (),
946  exportLIDs.size (), true,
947  "export_lids");
948  // This is an input array, so we have to copy to device here.
949  // However, we never need to copy it back to host.
950  auto export_pids_d =
952  sourcePIDs.getRawPtr (),
953  sourcePIDs.size (), true,
954  "export_pids");
955  constexpr bool pack_pids = true;
956  PackCrsGraphImpl::packCrsGraph<LO,GO,NT>(
957  sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
958  export_pids_d, constantNumPackets, pack_pids, distor);
959 
960  // The counts are an output of packCrsGraph, so we
961  // have to copy them back to host.
962  Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h
963  (numPacketsPerLID.getRawPtr (), numPacketsPerLID.size ());
964  Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid_d);
965 }
966 
967 } // namespace Details
968 } // namespace Tpetra
969 
// Explicit instantiation macro for the three public pack functions
// defined in this file: packCrsGraph, packCrsGraphNew, and
// packCrsGraphWithOwningPIDs, for one (LO, GO, NT) combination.
//
// The expansion names Details::, CrsGraph, and Distributor without
// further qualification, so it must be expanded in a scope where those
// names resolve (presumably namespace Tpetra).  The expansion ends
// with its own semicolon.
#define TPETRA_DETAILS_PACKCRSGRAPH_INSTANT( LO, GO, NT ) \
  template void Details::packCrsGraph<LO, GO, NT> ( \
    const CrsGraph<LO, GO, NT>&, \
    Teuchos::Array<CrsGraph<LO,GO,NT>::packet_type>&, \
    const Teuchos::ArrayView<size_t>&, \
    const Teuchos::ArrayView<const LO>&, \
    size_t&, \
    Distributor&); \
  template void Details::packCrsGraphNew<LO, GO, NT> ( \
    const CrsGraph<LO, GO, NT>&, \
    Kokkos::DualView<CrsGraph<LO,GO,NT>::packet_type*, CrsGraph<LO,GO,NT>::buffer_device_type>&, \
    const Kokkos::DualView<size_t*, CrsGraph<LO,GO,NT>::buffer_device_type>&, \
    const Kokkos::DualView<const LO*, NT::device_type>&, \
    size_t&, \
    Distributor&); \
  template void Details::packCrsGraphWithOwningPIDs<LO, GO, NT> ( \
    const CrsGraph<LO, GO, NT>&, \
    Kokkos::DualView<CrsGraph<LO,GO,NT>::packet_type*, CrsGraph<LO,GO,NT>::buffer_device_type>&, \
    const Teuchos::ArrayView<size_t>&, \
    const Teuchos::ArrayView<const LO>&, \
    const Teuchos::ArrayView<const int>&, \
    size_t&, \
    Distributor&);
996 
997 #endif // TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
Tpetra_Details_OrdinalTraits.hpp
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Tpetra::Details::PackCrsGraphImpl::NumPacketsAndOffsetsFunctor
Compute the number of packets and offsets for the pack procedure.
Definition: Tpetra_Details_packCrsGraph_def.hpp:111
Tpetra::Details::packCrsGraph
void packCrsGraph(const CrsGraph< LO, GO, NT > &sourceGraph, Teuchos::Array< typename CrsGraph< LO, GO, NT >::packet_type > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets, Distributor &distor)
Pack specified entries of the given local sparse graph for communication.
Definition: Tpetra_Details_packCrsGraph_def.hpp:783
Tpetra::Details::create_mirror_view_from_raw_host_array
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Definition: Tpetra_Details_createMirrorView.hpp:201
Tpetra_CrsGraph_decl.hpp
Declaration of the Tpetra::CrsGraph class.
Tpetra::Details::PackTraits::input_array_type
Kokkos::View< const value_type *, D, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
Definition: Tpetra_Details_PackTraits.hpp:89
Tpetra_Details_PackTraits.hpp
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpack data.
Details
Implementation details of Tpetra.
Tpetra::Details::packCrsGraphWithOwningPIDs
void packCrsGraphWithOwningPIDs(const CrsGraph< LO, GO, NT > &sourceGraph, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets, Distributor &distor)
Pack specified entries of the given local sparse graph for communication.
Definition: Tpetra_Details_packCrsGraph_def.hpp:913
Tpetra_Details_castAwayConstDualView.hpp
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpetra.
Tpetra::Distributor
Sets up and executes a communication plan for a Tpetra DistObject.
Definition: Tpetra_Distributor.hpp:188
Tpetra::Details::PackCrsGraphImpl::NumPacketsAndOffsetsFunctor::getError
int getError() const
Host function for getting the error.
Definition: Tpetra_Details_packCrsGraph_def.hpp:231
Tpetra_Details_getEntryOnHost.hpp
Declaration and definition of Tpetra::Details::getEntryOnHost.
Tpetra::Classes::DistObject< GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node >::packet_type
::Kokkos::Details::ArithTraits< GlobalOrdinal >::val_type packet_type
The type of each datum being sent or received in an Import or Export.
Definition: Tpetra_DistObject_decl.hpp:361
Tpetra::Details::packCrsGraphNew
void packCrsGraphNew(const CrsGraph< LO, GO, NT > &sourceGraph, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename NT::device_type > &exportLIDs, size_t &constantNumPackets, Distributor &distor)
Pack specified entries of the given local sparse graph for communication, for "new" DistObject interface.
Definition: Tpetra_Details_packCrsGraph_def.hpp:863
Tpetra::Details::LocalMap
::Tpetra::Details::Classes::LocalMap< LocalOrdinal, GlobalOrdinal, DeviceType > LocalMap
Alias for Tpetra::Details::Classes::LocalMap.
Definition: Tpetra_Details_LocalMap_fwd.hpp:72
Tpetra::Classes::CrsGraph::local_graph_type
Kokkos::StaticCrsGraph< LocalOrdinal, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
Definition: Tpetra_CrsGraph_decl.hpp:292
Tpetra
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Tpetra::deep_copy
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Definition: Tpetra_MultiVector_decl.hpp:2453
Tpetra::Details::castAwayConstDualView
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
Definition: Tpetra_Details_castAwayConstDualView.hpp:64
Tpetra::Classes::DistObject< GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node >::buffer_device_type
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Definition: Tpetra_DistObject_decl.hpp:710
Tpetra_Details_createMirrorView.hpp
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Tpetra::Classes::CrsGraph
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Definition: Tpetra_CrsGraph_decl.hpp:259