// Zoltan2_AlgMultiJagged.hpp — source listing extracted from the Zoltan2
// Doxygen documentation page ("Go to the documentation of this file").
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
57 #include <Teuchos_StandardParameterEntryValidators.hpp>
58 
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_ParameterList.hpp>
62 #include <new> // ::operator new[]
63 #include <algorithm> // std::sort
64 #include <Zoltan2_Util.hpp>
65 #include <vector>
66 
67 #if defined(__cplusplus) && __cplusplus >= 201103L
68 #include <unordered_map>
69 #else
70 #include <Teuchos_Hashtable.hpp>
71 #endif // C++11 is enabled
72 
73 #ifdef ZOLTAN2_USEZOLTANCOMM
74 #ifdef HAVE_ZOLTAN2_MPI
75 #define ENABLE_ZOLTAN_MIGRATION
76 #include "zoltan_comm_cpp.h"
77 #include "zoltan_types.h" // for error codes
78 #endif
79 #endif
80 
81 #ifdef HAVE_ZOLTAN2_OMP
82 #include <omp.h>
83 #endif
84 
85 #define LEAST_SIGNIFICANCE 0.0001
86 #define SIGNIFICANCE_MUL 1000
87 
88 //if the (last dimension reduce all count) x the mpi world size
89 //estimated to be bigger than this number then migration will be forced
90 //in earlier iterations.
91 #define FUTURE_REDUCEALL_CUTOFF 1500000
92 //if parts right before last dimension are estimated to have less than
93 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
94 #define MIN_WORK_LAST_DIM 1000
95 
96 
97 
98 
99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
100 //imbalance calculation. Wreal / Wexpected - 1
101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
102  (Wachieved) / ((totalW) * (expectedRatio)) - 1
103 #define imbalanceOf2(Wachieved, wExpected) \
104  (Wachieved) / (wExpected) - 1
105 
106 
107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
108 
109 
110 namespace Teuchos{
111 
116 template <typename Ordinal, typename T>
117 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
118 {
119 private:
120  Ordinal size;
121  T _EPSILON;
122 
123 public:
126  Zoltan2_BoxBoundaries ():size(0), _EPSILON (std::numeric_limits<T>::epsilon()){}
127 
134  Zoltan2_BoxBoundaries (Ordinal s_):
135  size(s_), _EPSILON (std::numeric_limits<T>::epsilon()){}
136 
139  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
140  {
141  for (Ordinal i=0; i < count; i++){
142  if (Z2_ABS(inBuffer[i]) > _EPSILON){
143  inoutBuffer[i] = inBuffer[i];
144  }
145  }
146  }
147 };
148 } // namespace Teuchos
149 
150 namespace Zoltan2{
151 
/*! \brief Allocates a new array of \p size elements of type T.
 *  \param size the number of elements to allocate.
 *  \return pointer to the newly allocated array, or NULL when \p size
 *          is zero (callers treat a zero-sized array as absent, and
 *          freeArray() accepts NULL).
 *  \throws std::bad_alloc on allocation failure.
 *
 *  Note: operator new[] reports failure by throwing std::bad_alloc and
 *  never returns NULL, so the former post-allocation NULL check (which
 *  threw a string literal) was dead code and has been removed.
 */
template <typename T>
T *allocMemory(size_t size){
    if (size > 0){
        return new T[size];
    }
    // Zero-length request: deliberately return NULL instead of a
    // zero-sized allocation.
    return NULL;
}
168 
/*! \brief Frees the given array if it was allocated, and resets the
 *  caller's pointer to NULL so the same array cannot be freed twice.
 *  \param array reference to the array pointer; may already be NULL,
 *         in which case the call is a no-op.
 */
template <typename T>
void freeArray(T *&array){
    if (array == NULL) {
        return; // nothing to release
    }
    delete [] array;
    array = NULL;
}
179 
180 
/*! \brief Multi-key sort item: carries an index, a key count, and a
 *  pointer to \c count key values.  Items compare lexicographically on
 *  the key values (with an _EPSILON tolerance for "equal"), falling
 *  back to the index when all keys are equal.
 *
 *  NOTE(review): this listing was extracted from a Doxygen page, and the
 *  extraction dropped the anchor-carrying lines: the class declaration
 *  itself ("template ... uMultiSortItem {") and the signature lines of
 *  the default constructor, copy constructor, destructor and operator=.
 *  Only their bodies survive below.  Restore the signatures from the
 *  upstream Zoltan2_AlgMultiJagged.hpp before compiling.
 */
template <typename IT, typename CT, typename WT>
{
public:
 //TODO: Why volatile?
 //no idea, another intel compiler faiulure.
 volatile IT index;
 volatile CT count;
 //unsigned int val;
 volatile WT *val;
 volatile WT _EPSILON;

 // (default constructor body; its signature line is missing - see note above)
 this->index = 0;
 this->count = 0;
 this->val = NULL;
 this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
 }


 uMultiSortItem(IT index_ ,CT count_, WT *vals_){
 this->index = index_;
 this->count = count_;
 this->val = vals_;
 this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
 }

 // (copy constructor body; its signature line is missing - see note above)
 this->index = other.index;
 this->count = other.count;
 this->val = other.val;
 this->_EPSILON = other._EPSILON;
 }

 // (destructor body; its signature line is missing - see note above)
 // The val array is deliberately NOT freed here; the item does not own it.
 //freeArray<WT>(this->val);
 }

 // Reassigns this item without copying the key array (shallow set).
 void set(IT index_ ,CT count_, WT *vals_){
 this->index = index_;
 this->count = count_;
 this->val = vals_;
 }


 // (operator= body; its signature line is missing - see note above)
 this->index = other.index;
 this->count = other.count;
 this->val = other.val;
 return *(this);
 }

 // Lexicographic "less than": the first key pair differing by more than
 // _EPSILON decides; fully equal keys fall back to comparing indices.
 bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
 assert (this->count == other.count);
 for(CT i = 0; i < this->count; ++i){
 //if the values are equal go to next one.
 if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
 continue;
 }
 //if next value is smaller return true;
 if(this->val[i] < other.val[i]){
 return true;
 }
 //if next value is bigger return false;
 else {
 return false;
 }
 }
 //if they are totally equal.
 return this->index < other.index;
 }
 // Lexicographic "greater than": mirror image of operator< above.
 bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
 assert (this->count == other.count);
 for(CT i = 0; i < this->count; ++i){
 //if the values are equal go to next one.
 if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
 continue;
 }
 //if next value is bigger return true;
 if(this->val[i] > other.val[i]){
 return true;
 }
 //if next value is smaller return false;
 else //(this->val[i] > other.val[i])
 {
 return false;
 }
 }
 //if they are totally equal.
 return this->index > other.index;
 }
};// uSortItem;
280 
/*! \brief Sort item that carries an \c id along with the sort key
 *  \c val; uqsort() orders arrays of these ascending by \c val.
 */
template <class IT, class WT>
struct uSortItem
{
 IT id; // identifier carried with the key (not used for ordering)
 //unsigned int val;
 WT val; // the sort key
};// uSortItem;
291 
295 template <class IT, class WT>
296 void uqsort(IT n, uSortItem<IT, WT> * arr)
297 {
298 
299  int NSTACK = 50;
300  int M = 7;
301  IT i, ir=n, j, k, l=1;
302  IT jstack=0, istack[50];
303  WT aval;
304  uSortItem<IT,WT> a, temp;
305 
306  --arr;
307  for (;;)
308  {
309  if (ir-l < M)
310  {
311  for (j=l+1;j<=ir;j++)
312  {
313  a=arr[j];
314  aval = a.val;
315  for (i=j-1;i>=1;i--)
316  {
317  if (arr[i].val <= aval)
318  break;
319  arr[i+1] = arr[i];
320  }
321  arr[i+1]=a;
322  }
323  if (jstack == 0)
324  break;
325  ir=istack[jstack--];
326  l=istack[jstack--];
327  }
328  else
329  {
330  k=(l+ir) >> 1;
331 
332  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
333  if (arr[l+1].val > arr[ir].val)
334  {
335  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
336  }
337  if (arr[l].val > arr[ir].val)
338  {
339  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
340  }
341  if (arr[l+1].val > arr[l].val)
342  {
343  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
344  }
345  i=l+1;
346  j=ir;
347  a=arr[l];
348  aval = a.val;
349  for (;;)
350  {
351  do i++; while (arr[i].val < aval);
352  do j--; while (arr[j].val > aval);
353  if (j < i) break;
354  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
355  }
356  arr[l]=arr[j];
357  arr[j]=a;
358  jstack += 2;
359  if (jstack > NSTACK){
360  std::cout << "uqsort: NSTACK too small in sort." << std::endl;
361  exit(1);
362  }
363  if (ir-i+1 >= j-l)
364  {
365  istack[jstack]=ir;
366  istack[jstack-1]=i;
367  ir=j-1;
368  }
369  else
370  {
371  istack[jstack]=j-1;
372  istack[jstack-1]=l;
373  l=i;
374  }
375  }
376  }
377 }
378 
/*! \brief Sort item with a separate sign bit.  signbit == 1 means the
 *  value is positive, 0 means negative, so a negative item always orders
 *  before a positive one; within the same sign the comparison direction
 *  is flipped for negatives so the logical key is (signbit ? +val : -val).
 *
 *  NOTE(review): the Doxygen extraction dropped the struct declaration
 *  line ("template ... uSignedSortItem {") as well as the signature
 *  lines of operator<= and operator>= near the end; only their bodies
 *  survive below.  Restore them from the upstream header.
 */
template <class IT, class WT, class SIGN>
{
 IT id;
 //unsigned int val;
 WT val;
 SIGN signbit; // 1 means positive, 0 means negative.
 bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
 /*if I am negative, the other is positive*/
 if (this->signbit < rhs.signbit){
 return true;
 }
 /*if both has the same sign*/
 else if (this->signbit == rhs.signbit){

 if (this->val < rhs.val){//if my value is smaller,
 return this->signbit;//then if we both are positive return true.
 //if we both are negative, return false.
 }
 else if (this->val > rhs.val){//if my value is larger,
 return !this->signbit; //then if we both are positive return false.
 //if we both are negative, return true.
 }
 else { //if both are equal.
 return false;
 }
 }
 else {
 /*if I am positive, the other is negative*/
 return false;
 }

 }
 bool operator>(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
 /*if I am positive, the other is negative*/
 if (this->signbit > rhs.signbit){
 return true;
 }
 /*if both has the same sign*/
 else if (this->signbit == rhs.signbit){

 if (this->val < rhs.val){//if my value is smaller,
 return !this->signbit;//then if we both are positive return false.
 //if we both are negative, return true.
 }
 else if (this->val > rhs.val){//if my value is larger,
 return this->signbit; //then if we both are positive return true.
 //if we both are negative, return false.
 }
 else { // if they are equal
 return false;
 }
 }
 else {
 /*if I am negative, the other is positive*/
 return false;
 }
 }
 // (operator<= body; its signature line is missing - see note above)
 return !(*this > rhs);}
 // (operator>= body; its signature line is missing - see note above)
 return !(*this < rhs);}
};
442 
/*! \brief Quicksort over an array of uSignedSortItem entries, ordered
 *  ascending by the item type's sign-aware operator< (same iterative
 *  Numerical-Recipes-style quicksort as uqsort above, but comparing
 *  whole items instead of a scalar key).
 *
 *  NOTE(review): the Doxygen extraction dropped the function's signature
 *  line (upstream it takes an item count and a pointer to the
 *  uSignedSortItem array) and the declaration line of the local items
 *  `a` and `temp`.  Restore both from the upstream header before
 *  compiling.
 */
template <class IT, class WT, class SIGN>

 IT NSTACK = 50;
 IT M = 7;
 IT i, ir=n, j, k, l=1;
 // NOTE(review): istack has 50 slots, but jstack is range-checked with
 // (jstack > NSTACK) only *after* being advanced by 2, so istack[50]
 // (one past the end) can be written when the stack fills.  Upstream
 // should size the stack NSTACK+1 or tighten the check.
 IT jstack=0, istack[50];
 // (declaration of locals `a` and `temp` was dropped here - see note above)

 --arr;
 for (;;)
 {
 if (ir < M + l)
 {
 // Small partition: finish it with straight insertion sort.
 for (j=l+1;j<=ir;j++)
 {
 a=arr[j];
 for (i=j-1;i>=1;i--)
 {
 if (arr[i] <= a)
 {
 break;
 }
 arr[i+1] = arr[i];
 }
 arr[i+1]=a;
 }
 if (jstack == 0)
 break;
 // Pop the next pending partition off the explicit stack.
 ir=istack[jstack--];
 l=istack[jstack--];
 }
 else
 {
 // Median-of-three pivot selection among arr[l], arr[mid], arr[ir].
 k=(l+ir) >> 1;
 ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
 if (arr[l+1] > arr[ir])
 {
 ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
 }
 if (arr[l] > arr[ir])
 {
 ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
 }
 if (arr[l+1] > arr[l])
 {
 ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
 }
 // Partition the range around the pivot item in arr[l].
 i=l+1;
 j=ir;
 a=arr[l];
 for (;;)
 {
 do i++; while (arr[i] < a);
 do j--; while (arr[j] > a);
 if (j < i) break;
 ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
 }
 arr[l]=arr[j];
 arr[j]=a;
 // Push the larger subrange; keep iterating on the smaller one.
 jstack += 2;
 if (jstack > NSTACK){
 std::cout << "uqsort: NSTACK too small in sort." << std::endl;
 exit(1);
 }
 if (ir+l+1 >= j+i)
 {
 istack[jstack]=ir;
 istack[jstack-1]=i;
 ir=j-1;
 }
 else
 {
 istack[jstack]=j-1;
 istack[jstack-1]=l;
 l=i;
 }
 }
 }
}
526 
/*! \brief Multi Jagged (MJ) coordinate partitioning algorithm: recursively
 *  partitions coordinates along one dimension at a time into the requested
 *  number of parts, optionally migrating coordinates between ranks and
 *  tracking per-part bounding boxes.
 *
 *  NOTE(review): this listing was extracted from a Doxygen page; several
 *  anchor-carrying lines inside the class (a typedef, two method
 *  signature lines, and many per-method Doxygen comment blocks) were
 *  dropped by the extraction.  The spots are marked with NOTE(review)
 *  comments below; restore the missing lines from the upstream
 *  Zoltan2_AlgMultiJagged.hpp before compiling.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
 typename mj_part_t>
class AlgMJ
{
private:
 // NOTE(review): the typedef defining mj_partBox_t was dropped by the
 // extraction; the vector typedef below (and members further down)
 // depend on it.
 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;

 RCP<const Environment> mj_env; //the environment object
 RCP<const Comm<int> > mj_problemComm; //initial comm object

 double imbalance_tolerance; //input imbalance tolerance.
 mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
 int recursion_depth; //the number of steps that partitioning will be solved in.
 int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord

 size_t initial_num_loc_coords; //initial num local coords.
 global_size_t initial_num_glob_coords; //initial num global coords.

 mj_lno_t num_local_coords; //number of local coords.
 mj_gno_t num_global_coords; //number of global coords.

 mj_scalar_t **mj_coordinates; //two dimension coordinate array
 mj_scalar_t **mj_weights; //two dimension weight array
 bool *mj_uniform_parts; //if the target parts are uniform
 mj_scalar_t **mj_part_sizes; //target part weight sizes.
 bool *mj_uniform_weights; //if the coordinates have uniform weights.

 ArrayView<const mj_gno_t> mj_gnos; //global ids of the coordinates, comes from the input
 size_t num_global_parts; //the targeted number of parts

 mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
 mj_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration.
 int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations.

 mj_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning.
 mj_lno_t *new_coordinate_permutations; //permutation work array.
 mj_part_t *assigned_part_ids; //the part ids assigned to coordinates.

 mj_lno_t *part_xadj; //beginning and end of each part.
 mj_lno_t *new_part_xadj; // work array for beginning and end of each part.

 //get mj specific parameters.
 bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordiante to different parts.
 mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.

 bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
 int mj_user_recursion_depth; //the recursion depth value provided by user.
 bool mj_keep_part_boxes; //if the boxes need to be kept.

 int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
 int migration_type; // when doing the migration, 0 will aim for perfect load-imbalance,
 //1 - will aim for minimized number of messages with possibly bad load-imbalance
 mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
 int num_threads; //num threads

 mj_part_t total_num_cut ; //how many cuts will be totally
 mj_part_t total_num_part; //how many parts will be totally

 mj_part_t max_num_part_along_dim ; //maximum part count along a dimension.
 mj_part_t max_num_cut_along_dim; //maximum cut count along a dimension.
 size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension.

 mj_part_t total_dim_num_reduce_all; //estimate on #reduceAlls can be done.
 mj_part_t last_dim_num_part; //max no of parts that might occur
 //during the partition before the
 //last partitioning dimension.

 RCP<Comm<int> > comm; //comm object than can be altered during execution
 float fEpsilon; //epsilon for float
 mj_scalar_t sEpsilon; //epsilon for mj_scalar_t

 mj_scalar_t maxScalar_t; //max possible scalar
 mj_scalar_t minScalar_t; //min scalar

 mj_scalar_t *all_cut_coordinates;
 mj_scalar_t *max_min_coords;
 mj_scalar_t *process_cut_line_weight_to_put_left; //how much weight should a MPI put left side of the each cutline
 mj_scalar_t **thread_cut_line_weight_to_put_left; //how much weight percentage should each thread in MPI put left side of the each outline

 // work array to manipulate coordinate of cutlines in different iterations.
 //necessary because previous cut line information is used for determining
 //the next cutline information. therefore, cannot update the cut work array
 //until all cutlines are determined.
 mj_scalar_t *cut_coordinates_work_array;

 //cumulative part weight array.
 mj_scalar_t *target_part_weights;

 mj_scalar_t *cut_upper_bound_coordinates ; //upper bound coordinate of a cut line
 mj_scalar_t *cut_lower_bound_coordinates ; //lower bound coordinate of a cut line
 mj_scalar_t *cut_lower_bound_weights ; //lower bound weight of a cut line
 mj_scalar_t *cut_upper_bound_weights ; //upper bound weight of a cut line

 mj_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part.
 mj_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight.

 //isDone is used to determine if a cutline is determined already.
 //If a cut line is already determined, the next iterations will skip this cut line.
 bool *is_cut_line_determined;
 //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
 //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
 mj_part_t *my_incomplete_cut_count;
 //local part weights of each thread.
 double **thread_part_weights;
 //the work manupulation array for partweights.
 double **thread_part_weight_work;

 //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
 mj_scalar_t **thread_cut_left_closest_point;
 //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
 mj_scalar_t **thread_cut_right_closest_point;

 //to store how many points in each part a thread has.
 mj_lno_t **thread_point_counts;

 mj_scalar_t *process_rectilinear_cut_weight;
 mj_scalar_t *global_rectilinear_cut_weight;

 //for faster communication, concatanation of
 //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
 //leftClosest distances sized P-1, since P-1 cut lines
 //rightClosest distances size P-1, since P-1 cut lines.
 mj_scalar_t *total_part_weight_left_right_closests ;
 mj_scalar_t *global_total_part_weight_left_right_closests;

 RCP<mj_partBoxVector_t> kept_boxes; // vector of all boxes for all parts;
 // constructed only if
 // mj_keep_part_boxes == true
 RCP<mj_partBox_t> global_box;
 int myRank, myActualRank; //processor rank, and initial rank

 bool divide_to_prime_first;

 // NOTE(review): many of the private method declarations below lost
 // their per-method Doxygen comment blocks to the extraction; the
 // surviving plain comments are kept as-is.

 /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
 * the input. part_no_array takes
 * precedence if both are provided.
 * Depending on these parameters, total cut/part number,
 * maximum part/cut number along a dimension, estimated number of reduceAlls,
 * and the number of parts before the last dimension is calculated.
 * */
 void set_part_specifications();

 /* \brief Tries to determine the part number for current dimension,
 * by trying to make the partitioning as square as possible.
 * \param num_total_future how many more partitionings are required.
 * \param root how many more recursion depth is left.
 */
 inline mj_part_t get_part_count(
 mj_part_t num_total_future,
 double root);

 /* \brief Allocates the all required memory for the mj partitioning algorithm.
 *
 */
 void allocate_set_work_memory();

 /* \brief for part communication we keep track of the box boundaries.
 * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
 * This function initializes a single box with all global min and max coordinates.
 * \param initial_partitioning_boxes the input and output vector for boxes.
 */
 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);

 /* \brief compute global bounding box: min/max coords of global domain */
 void compute_global_box();

 /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
 * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
 * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
 * If part boxes are kept, then sets initializes the output_part_boxes as its ancestor.
 *
 * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
 * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
 * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
 * \param future_num_parts: output, max number of future parts that will be obtained from a single
 * \param current_num_parts: input, how many parts are there currently.
 * \param current_iteration: input, current dimension iteration number.
 * \param input_part_boxes: input, if boxes are kept, current boxes.
 * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
 */
 mj_part_t update_part_num_arrays(
 std::vector<mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
 std::vector<mj_part_t> *future_num_part_in_parts,
 std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
 mj_part_t &future_num_parts,
 mj_part_t current_num_parts,
 int current_iteration,
 RCP<mj_partBoxVector_t> input_part_boxes,
 RCP<mj_partBoxVector_t> output_part_boxes,
 mj_part_t atomic_part_count);

 void mj_get_local_min_max_coord_totW(
 mj_lno_t coordinate_begin_index,
 mj_lno_t coordinate_end_index,
 mj_lno_t *mj_current_coordinate_permutations,
 mj_scalar_t *mj_current_dim_coords,
 mj_scalar_t &min_coordinate,
 mj_scalar_t &max_coordinate,
 mj_scalar_t &total_weight);

 void mj_get_global_min_max_coord_totW(
 mj_part_t current_concurrent_num_parts,
 mj_scalar_t *local_min_max_total,
 mj_scalar_t *global_min_max_total);

 void mj_get_initial_cut_coords_target_weights(
 mj_scalar_t min_coord,
 mj_scalar_t max_coord,
 mj_part_t num_cuts/*p-1*/ ,
 mj_scalar_t global_weight,
 mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
 mj_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,

 std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
 std::vector <mj_part_t> *next_future_num_parts_in_parts,
 mj_part_t concurrent_current_part,
 mj_part_t obtained_part_index);

 void set_initial_coordinate_parts(
 mj_scalar_t &max_coordinate,
 mj_scalar_t &min_coordinate,
 mj_part_t &concurrent_current_part_index,
 mj_lno_t coordinate_begin_index,
 mj_lno_t coordinate_end_index,
 mj_lno_t *mj_current_coordinate_permutations,
 mj_scalar_t *mj_current_dim_coords,
 mj_part_t *mj_part_ids,
 mj_part_t &partition_count);

 void mj_1D_part(
 mj_scalar_t *mj_current_dim_coords,
 mj_scalar_t imbalanceTolerance,
 mj_part_t current_work_part,
 mj_part_t current_concurrent_num_parts,
 mj_scalar_t *current_cut_coordinates,
 mj_part_t total_incomplete_cut_count,
 std::vector <mj_part_t> &num_partitioning_in_current_dim);

 void mj_1D_part_get_thread_part_weights(
 size_t total_part_count,
 mj_part_t num_cuts,
 mj_scalar_t max_coord,
 mj_scalar_t min_coord,
 mj_lno_t coordinate_begin_index,
 mj_lno_t coordinate_end_index,
 mj_scalar_t *mj_current_dim_coords,
 mj_scalar_t *temp_current_cut_coords,
 bool *current_cut_status,
 double *my_current_part_weights,
 mj_scalar_t *my_current_left_closest,
 mj_scalar_t *my_current_right_closest);

 void mj_accumulate_thread_results(
 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
 mj_part_t current_work_part,
 mj_part_t current_concurrent_num_parts);

 void mj_get_new_cut_coordinates(
 const size_t &num_total_part,
 const mj_part_t &num_cuts,
 const mj_scalar_t &max_coordinate,
 const mj_scalar_t &min_coordinate,
 const mj_scalar_t &global_total_weight,
 const mj_scalar_t &used_imbalance_tolerance,
 mj_scalar_t * current_global_part_weights,
 const mj_scalar_t * current_local_part_weights,
 const mj_scalar_t *current_part_target_weights,
 bool *current_cut_line_determined,
 mj_scalar_t *current_cut_coordinates,
 mj_scalar_t *current_cut_upper_bounds,
 mj_scalar_t *current_cut_lower_bounds,
 mj_scalar_t *current_global_left_closest_points,
 mj_scalar_t *current_global_right_closest_points,
 mj_scalar_t * current_cut_lower_bound_weights,
 mj_scalar_t * current_cut_upper_weights,
 mj_scalar_t *new_current_cut_coordinates,
 mj_scalar_t *current_part_cut_line_weight_to_put_left,
 mj_part_t *rectilinear_cut_count,
 mj_part_t &my_num_incomplete_cut);

 void mj_calculate_new_cut_position (
 mj_scalar_t cut_upper_bound,
 mj_scalar_t cut_lower_bound,
 mj_scalar_t cut_upper_weight,
 mj_scalar_t cut_lower_weight,
 mj_scalar_t expected_weight,
 mj_scalar_t &new_cut_position);

 void mj_create_new_partitions(
 mj_part_t num_parts,
 mj_scalar_t *mj_current_dim_coords,
 mj_scalar_t *current_concurrent_cut_coordinate,
 mj_lno_t coordinate_begin,
 mj_lno_t coordinate_end,
 mj_scalar_t *used_local_cut_line_weight_to_left,
 double **used_thread_part_weight_work,
 mj_lno_t *out_part_xadj);

 bool mj_perform_migration(
 mj_part_t in_num_parts, //current umb parts
 mj_part_t &out_num_parts, //output umb parts.
 std::vector<mj_part_t> *next_future_num_parts_in_parts,
 mj_part_t &output_part_begin_index,
 size_t migration_reduce_all_population,
 mj_lno_t num_coords_for_last_dim_part,
 std::string iteration,
 RCP<mj_partBoxVector_t> &input_part_boxes,
 RCP<mj_partBoxVector_t> &output_part_boxes);

 void get_processor_num_points_in_parts(
 mj_part_t num_procs,
 mj_part_t num_parts,
 mj_gno_t *&num_points_in_all_processor_parts);

 bool mj_check_to_migrate(
 size_t migration_reduce_all_population,
 mj_lno_t num_coords_for_last_dim_part,
 mj_part_t num_procs,
 mj_part_t num_parts,
 mj_gno_t *num_points_in_all_processor_parts);


 void mj_migration_part_proc_assignment(
 mj_gno_t * num_points_in_all_processor_parts,
 mj_part_t num_parts,
 mj_part_t num_procs,
 mj_lno_t *send_count_to_each_proc,
 std::vector<mj_part_t> &processor_ranks_for_subcomm,
 std::vector<mj_part_t> *next_future_num_parts_in_parts,
 mj_part_t &out_num_part,
 std::vector<mj_part_t> &out_part_indices,
 mj_part_t &output_part_numbering_begin_index,
 int *coordinate_destinations);

 void mj_assign_proc_to_parts(
 mj_gno_t * num_points_in_all_processor_parts,
 mj_part_t num_parts,
 mj_part_t num_procs,
 mj_lno_t *send_count_to_each_proc,
 std::vector<mj_part_t> &processor_ranks_for_subcomm,
 std::vector<mj_part_t> *next_future_num_parts_in_parts,
 mj_part_t &out_part_index,
 mj_part_t &output_part_numbering_begin_index,
 int *coordinate_destinations);

 void assign_send_destinations(
 mj_part_t num_parts,
 mj_part_t *part_assignment_proc_begin_indices,
 mj_part_t *processor_chains_in_parts,
 mj_lno_t *send_count_to_each_proc,
 int *coordinate_destinations);

 void assign_send_destinations2(
 mj_part_t num_parts,
 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
 int *coordinate_destinations,
 mj_part_t &output_part_numbering_begin_index,
 std::vector<mj_part_t> *next_future_num_parts_in_parts);

 void mj_assign_parts_to_procs(
 mj_gno_t * num_points_in_all_processor_parts,
 mj_part_t num_parts,
 mj_part_t num_procs,
 mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
 std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
 mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
 std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
 mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
 int *coordinate_destinations);

 void mj_migrate_coords(
 mj_part_t num_procs,
 mj_lno_t &num_new_local_points,
 std::string iteration,
 int *coordinate_destinations,
 mj_part_t num_parts);

 void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);


 void fill_permutation_array(
 mj_part_t output_num_parts,
 mj_part_t num_parts);

 void set_final_parts(
 mj_part_t current_num_parts,
 mj_part_t output_part_begin_index,
 RCP<mj_partBoxVector_t> &output_part_boxes,
 bool is_data_ever_migrated);
 void free_work_memory();
 void create_consistent_chunks(
 mj_part_t num_parts,
 mj_scalar_t *mj_current_dim_coords,
 mj_scalar_t *current_concurrent_cut_coordinate,
 mj_lno_t coordinate_begin,
 mj_lno_t coordinate_end,
 mj_scalar_t *used_local_cut_line_weight_to_left,
 mj_lno_t *out_part_xadj,
 int coordInd, bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);

 /* \brief Returns the largest prime factor of num_parts by trial
 * division; returns 1 when num_parts <= 1. */
 mj_part_t find_largest_prime_factor(mj_part_t num_parts){
 mj_part_t largest_factor = 1;
 mj_part_t n = num_parts;
 mj_part_t divisor = 2;
 while (n > 1){
 while (n % divisor == 0){
 n = n / divisor;
 largest_factor = divisor;
 }
 ++divisor;
 if (divisor * divisor > n){
 if (n > 1){
 // the remaining n is itself prime and is the largest factor.
 largest_factor = n;
 }
 break;
 }
 }
 return largest_factor;
 }
public:
 AlgMJ();

 void multi_jagged_part(
 const RCP<const Environment> &env,
 RCP<const Comm<int> > &problemComm,

 double imbalance_tolerance,
 size_t num_global_parts,
 mj_part_t *part_no_array,
 int recursion_depth,

 int coord_dim,
 mj_lno_t num_local_coords,
 mj_gno_t num_global_coords,
 const mj_gno_t *initial_mj_gnos,
 mj_scalar_t **mj_coordinates,

 int num_weights_per_coord,
 bool *mj_uniform_weights,
 mj_scalar_t **mj_weights,
 bool *mj_uniform_parts,
 mj_scalar_t **mj_part_sizes,

 mj_part_t *&result_assigned_part_ids,
 mj_gno_t *&result_mj_gnos

 );
 // NOTE(review): the signature line of this parameter setter (upstream
 // it is named set_partitioning_parameters) was dropped by the
 // extraction; only its parameter list survives below.
 bool distribute_points_on_cut_lines_,
 int max_concurrent_part_calculation_,
 int check_migrate_avoid_migration_option_,
 mj_scalar_t minimum_migration_imbalance_, int migration_type_ = 0);
 void set_to_keep_part_boxes();

 RCP<mj_partBox_t> get_global_box() const;

 RCP<mj_partBoxVector_t> get_kept_boxes() const;

 RCP<mj_partBoxVector_t> compute_global_box_boundaries(
 RCP<mj_partBoxVector_t> &localPartBoxes) const;

 // NOTE(review): the signature line of this method (upstream it is
 // named sequential_task_partitioning; its definition begins after the
 // class) was dropped by the extraction; only its parameter list
 // survives below.
 const RCP<const Environment> &env,
 mj_lno_t num_total_coords,
 mj_lno_t num_selected_coords,
 size_t num_target_part,
 int coord_dim,
 mj_scalar_t **mj_coordinates,
 mj_lno_t *initial_selected_coords_output_permutation,
 mj_lno_t *output_xadj,
 int recursion_depth,
 const mj_part_t *part_no_array,
 bool partition_along_longest_dim,
 int num_ranks_per_node,
 bool divide_to_prime_first_);

};
1370 
/*! \brief Serial Multi-Jagged partitioning used by the task mapper: partitions
 * num_selected_coords of the given coordinates into num_target_part parts on a
 * single process (a serial communicator is installed below), writing the final
 * coordinate order into inital_adjList_output_adjlist and the part boundaries
 * (CSR style) into output_xadj.
 *
 * NOTE(review): the qualified function-name line of this out-of-class
 * definition (presumably "void AlgMJ<...>::sequential_task_partitioning(")
 * appears to have been lost in this extraction; the parameter list matches the
 * sequential task-partitioning declaration in the class — confirm against the
 * repository before editing.
 *
 * Weights and part sizes are uniform here (task mapping); see the allocations
 * of the single-element uniform arrays below.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
    const RCP<const Environment> &env,
    mj_lno_t num_total_coords,
    mj_lno_t num_selected_coords,
    size_t num_target_part,
    int coord_dim_,
    mj_scalar_t **mj_coordinates_,
    mj_lno_t *inital_adjList_output_adjlist,
    mj_lno_t *output_xadj,
    int rd,
    const mj_part_t *part_no_array_,
    bool partition_along_longest_dim,
    int num_ranks_per_node,
    bool divide_to_prime_first_
){


  this->mj_env = env;
  // Run on a serial communicator regardless of the caller's communicator.
  const RCP<Comm<int> > commN;
  this->mj_problemComm =
    Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
  this->comm =
    Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
  this->myActualRank = this->myRank = 1;

#ifdef HAVE_ZOLTAN2_OMP
  //int actual_num_threads = omp_get_num_threads();
  //omp_set_num_threads(1);
#endif

  this->divide_to_prime_first = divide_to_prime_first_;
  //weights are uniform for task mapping

  //parts are uniform for task mapping
  //as input indices.
  this->imbalance_tolerance = 0;
  this->num_global_parts = num_target_part;
  this->part_no_array = (mj_part_t *)part_no_array_;
  this->recursion_depth = rd;

  this->coord_dim = coord_dim_;
  this->num_local_coords = num_total_coords;
  this->num_global_coords = num_total_coords;
  this->mj_coordinates = mj_coordinates_;  //will copy the memory to this->mj_coordinates.

  this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);

  this->num_weights_per_coord = 0;
  // Single-element "uniform" arrays; freed via freeArray at the end of this
  // function although allocated with new[] — assumes freeArray performs
  // delete[]; confirm against its definition.
  bool *tmp_mj_uniform_weights = new bool[1];
  this->mj_uniform_weights = tmp_mj_uniform_weights ;
  this->mj_uniform_weights[0] = true;

  mj_scalar_t **tmp_mj_weights = new mj_scalar_t *[1];
  this->mj_weights = tmp_mj_weights; //will copy the memory to this->mj_weights

  bool *tmp_mj_uniform_parts = new bool[1];
  this->mj_uniform_parts = tmp_mj_uniform_parts;
  this->mj_uniform_parts[0] = true;

  mj_scalar_t **tmp_mj_part_sizes = new mj_scalar_t * [1];
  this->mj_part_sizes = tmp_mj_part_sizes;
  this->mj_part_sizes[0] = NULL;

  this->num_threads = 1;
  this->set_part_specifications();

  this->allocate_set_work_memory();
  //the end of the initial partition is the end of coordinates.
  this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
  for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
    this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
  }

  mj_part_t current_num_parts = 1;

  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;

  mj_part_t future_num_parts = this->total_num_part;

  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
  next_future_num_parts_in_parts->push_back(this->num_global_parts);
  RCP<mj_partBoxVector_t> t1;
  RCP<mj_partBoxVector_t> t2;


  // Scratch used only when partition_along_longest_dim: per-dimension
  // coordinate ranges, sorted to pick the dimension with the widest range.
  std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted = &(coord_dimension_range_sorted[0]);
  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);

  for (int i = 0; i < this->recursion_depth; ++i){

    //partitioning array. size will be as the number of current partitions and this
    //holds how many parts that each part will be in the current dimension partitioning.
    std::vector <mj_part_t> num_partitioning_in_current_dim;

    //number of parts that will be obtained at the end of this partitioning.
    //future_num_part_in_parts is as the size of current number of parts.
    //holds how many more parts each should be divided in the further
    //iterations. this will be used to calculate num_partitioning_in_current_dim,
    //as the number of parts that the part will be partitioned
    //in the current dimension partitioning.

    //next_future_num_parts_in_parts will be as the size of outnumParts,
    //and this will hold how many more parts that each output part
    //should be divided. this array will also be used to determine the weight ratios
    //of the parts.
    //swap the arrays to use iteratively..
    std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
    future_num_part_in_parts = next_future_num_parts_in_parts;
    next_future_num_parts_in_parts = tmpPartVect;

    //clear next_future_num_parts_in_parts array as
    //getPartitionArrays expects it to be empty.
    //it also expects num_partitioning_in_current_dim to be empty as well.
    next_future_num_parts_in_parts->clear();


    //returns the total number of output parts for this dimension partitioning.
    mj_part_t output_part_count_in_dimension =
      this->update_part_num_arrays(
        num_partitioning_in_current_dim,
        future_num_part_in_parts,
        next_future_num_parts_in_parts,
        future_num_parts,
        current_num_parts,
        i,
        t1,
        t2, num_ranks_per_node);

    //if the number of obtained parts equal to current number of parts,
    //skip this dimension. For example, this happens when 1 is given in the input
    //part array is given. P=4,5,1,2
    if(output_part_count_in_dimension == current_num_parts) {
      // Undo the swap done above so the arrays stay consistent for the next level.
      tmpPartVect= future_num_part_in_parts;
      future_num_part_in_parts = next_future_num_parts_in_parts;
      next_future_num_parts_in_parts = tmpPartVect;
      continue;
    }

    //convert i to string to be used for debugging purposes.
    std::string istring = Teuchos::toString<int>(i);

    //alloc Memory to point the indices
    //of the parts in the permutation array.
    this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);

    //the index where in the outtotalCounts will be written.
    mj_part_t output_part_index = 0;
    //whatever is written to outTotalCounts will be added with previousEnd
    //so that the points will be shifted.
    mj_part_t output_coordinate_end_index = 0;

    mj_part_t current_work_part = 0;
    mj_part_t current_concurrent_num_parts = 1;

    mj_part_t obtained_part_index = 0;

    //get the coordinate axis along which the partitioning will be done.
    int coordInd = i % this->coord_dim;
    mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];


    //run for all available parts.
    for (; current_work_part < current_num_parts;
        current_work_part += current_concurrent_num_parts){


      //current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
      //this->max_concurrent_part_calculation);

      mj_part_t actual_work_part_count = 0;
      //initialization for 1D partitioning.
      //get the min and max coordinates of each part
      //together with the part weights of each part.
      for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
        mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;

        //if this part wont be partitioned any further
        //dont do any work for this part.
        if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
          continue;
        }
        ++actual_work_part_count;
        mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
        mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];

        /*
        std::cout << "i:" << i << " j:" << current_work_part + kk
          << " coordinate_begin_index:" << coordinate_begin_index
          << " coordinate_end_index:" << coordinate_end_index
          << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
        */


        if(partition_along_longest_dim){
          // Measure the range of every dimension and partition along the
          // one with the largest extent, instead of cycling i % coord_dim.
          mj_scalar_t best_weight_coord = 0;
          for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
            mj_scalar_t best_min_coord = 0;
            mj_scalar_t best_max_coord = 0;
            //MD:same for all coordinates, but I will still use this for now.

            this->mj_get_local_min_max_coord_totW(
              coordinate_begin_index,
              coordinate_end_index,
              this->coordinate_permutations,
              this->mj_coordinates[coord_traverse_ind],
              best_min_coord, //min coordinate
              best_max_coord, //max coordinate
              best_weight_coord //total weight);
            );

            coord_dim_mins[coord_traverse_ind] = best_min_coord;
            coord_dim_maxs[coord_traverse_ind] = best_max_coord;
            mj_scalar_t best_range = best_max_coord - best_min_coord;
            coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
            coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
            coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
          }


          // Ascending sort; the widest-range dimension ends up last.
          uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
          coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;

          /*
          for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
            std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id << " range:" << p_coord_dimension_range_sorted[coord_traverse_ind].val << std::endl;
            std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id << " coord_dim_mins:" << coord_dim_mins[p_coord_dimension_range_sorted[coord_traverse_ind].id]<< std::endl;
            std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id << " coord_dim_maxs:" << coord_dim_maxs[p_coord_dimension_range_sorted[coord_traverse_ind].id] << std::endl;

          }
          */

          mj_current_dim_coords = this->mj_coordinates[coordInd];

          // Layout: [0, c) mins, [c, 2c) maxs, [2c, 3c) total weights,
          // where c = current_concurrent_num_parts.
          this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
          this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
          this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;

        }
        else{
          this->mj_get_local_min_max_coord_totW(
            coordinate_begin_index,
            coordinate_end_index,
            this->coordinate_permutations,
            mj_current_dim_coords,
            this->process_local_min_max_coord_total_weight[kk], //min coordinate
            this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max coordinate
            this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] //total weight);
          );
        }
      }

      //1D partitioning
      if (actual_work_part_count > 0){
        //obtain global Min max of the part.
        this->mj_get_global_min_max_coord_totW(
          current_concurrent_num_parts,
          this->process_local_min_max_coord_total_weight,
          this->global_min_max_coord_total_weight);

        //represents the total number of cutlines
        //whose coordinate should be determined.
        mj_part_t total_incomplete_cut_count = 0;

        //Compute weight ratios for parts & cuts:
        //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
        //part0 cut0 part1 cut1 part2 cut2 part3
        mj_part_t concurrent_part_cut_shift = 0;
        mj_part_t concurrent_part_part_shift = 0;


        for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
          mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
          mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
            current_concurrent_num_parts];
          mj_scalar_t global_total_weight =
            this->global_min_max_coord_total_weight[kk +
            2 * current_concurrent_num_parts];

          mj_part_t concurrent_current_part_index = current_work_part + kk;

          mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];

          mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
          mj_scalar_t *current_target_part_weights = this->target_part_weights +
            concurrent_part_part_shift;
          //shift the usedCutCoordinate array as noCuts.
          concurrent_part_cut_shift += partition_count - 1;
          //shift the partRatio array as noParts.
          concurrent_part_part_shift += partition_count;

          //calculate only if part is not empty,
          //and part will be further partitioend.
          if(partition_count > 1 && min_coordinate <= max_coordinate){

            //increase allDone by the number of cuts of the current
            //part's cut line number.
            total_incomplete_cut_count += partition_count - 1;
            //set the number of cut lines that should be determined
            //for this part.
            this->my_incomplete_cut_count[kk] = partition_count - 1;

            //get the target weights of the parts.
            this->mj_get_initial_cut_coords_target_weights(
              min_coordinate,
              max_coordinate,
              partition_count - 1,
              global_total_weight,
              usedCutCoordinate,
              current_target_part_weights,
              future_num_part_in_parts,
              next_future_num_parts_in_parts,
              concurrent_current_part_index,
              obtained_part_index);

            mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
            mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];

            //get the initial estimated part assignments of the coordinates.
            this->set_initial_coordinate_parts(
              max_coordinate,
              min_coordinate,
              concurrent_current_part_index,
              coordinate_begin_index, coordinate_end_index,
              this->coordinate_permutations,
              mj_current_dim_coords,
              this->assigned_part_ids,
              partition_count);

          }
          else {
            // e.g., if have fewer coordinates than parts, don't need to do next dim.
            this->my_incomplete_cut_count[kk] = 0;
          }
          obtained_part_index += partition_count;
        }

        //used imbalance, it is always 0, as it is difficult to estimate a range.
        mj_scalar_t used_imbalance = 0;


        // Determine cut lines for k parts here.
        this->mj_1D_part(
          mj_current_dim_coords,
          used_imbalance,
          current_work_part,
          current_concurrent_num_parts,
          current_cut_coordinates,
          total_incomplete_cut_count,
          num_partitioning_in_current_dim);
      }
      else {
        obtained_part_index += current_concurrent_num_parts;
      }

      //create part chunks
      {

        mj_part_t output_array_shift = 0;
        mj_part_t cut_shift = 0;
        size_t tlr_shift = 0;
        size_t partweight_array_shift = 0;

        for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
          mj_part_t current_concurrent_work_part = current_work_part + kk;
          mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];

          //if the part is empty, skip the part.
          // (min > max signals an empty range from the global reduction.)
          if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
            this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {

            for(mj_part_t jj = 0; jj < num_parts; ++jj){
              this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
            }
            cut_shift += num_parts - 1;
            tlr_shift += (4 *(num_parts - 1) + 1);
            output_array_shift += num_parts;
            partweight_array_shift += (2 * (num_parts - 1) + 1);
            continue;
          }

          mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
          mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
            -1];
          mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
          mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
            cut_shift;

          for(int ii = 0; ii < this->num_threads; ++ii){
            this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
          }

          if(num_parts > 1){
            // Rewrite the indices based on the computed cuts.
            this->create_consistent_chunks(
              num_parts,
              mj_current_dim_coords,
              current_concurrent_cut_coordinate,
              coordinate_begin,
              coordinate_end,
              used_local_cut_line_weight_to_left,
              this->new_part_xadj + output_part_index + output_array_shift,
              coordInd,
              partition_along_longest_dim,
              p_coord_dimension_range_sorted);
          }
          else {
            //if this part is partitioned into 1 then just copy
            //the old values.
            mj_lno_t part_size = coordinate_end - coordinate_begin;
            *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
            memcpy(this->new_coordinate_permutations + coordinate_begin,
              this->coordinate_permutations + coordinate_begin,
              part_size * sizeof(mj_lno_t));
          }



          cut_shift += num_parts - 1;
          tlr_shift += (4 *(num_parts - 1) + 1);
          output_array_shift += num_parts;
          partweight_array_shift += (2 * (num_parts - 1) + 1);
        }

        //shift cut coordinates so that all cut coordinates are stored.
        //current_cut_coordinates += cutShift;

        //getChunks from coordinates partitioned the parts and
        //wrote the indices as if there were a single part.
        //now we need to shift the beginning indices.
        for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
          mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
          for (mj_part_t ii = 0;ii < num_parts ; ++ii){
            //shift it by previousCount
            this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
            if (ii % 2 == 1){
              mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
              mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];

              for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
                mj_lno_t l = this->new_coordinate_permutations[task_traverse];
                //MARKER: FLIPPED ZORDER BELOW
                // Negating the coordinate reverses the traversal direction of
                // every other part — presumably to produce a serpentine
                // (Z-order-like) ordering for task mapping; confirm intent.
                mj_current_dim_coords[l] = -mj_current_dim_coords[l];
              }
            }
          }
          //increase the previous count by current end.
          output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
          //increase the current out.
          output_part_index += num_parts ;
        }
      }
    }
    // end of this partitioning dimension

    //set the current num parts for next dim partitioning
    current_num_parts = output_part_count_in_dimension;

    //swap the coordinate permutations for the next dimension.
    mj_lno_t * tmp = this->coordinate_permutations;
    this->coordinate_permutations = this->new_coordinate_permutations;
    this->new_coordinate_permutations = tmp;

    freeArray<mj_lno_t>(this->part_xadj);
    this->part_xadj = this->new_part_xadj;
    this->new_part_xadj = NULL;
  }

  // Write the final coordinate order back into the caller's array.
  for(mj_lno_t i = 0; i < num_total_coords; ++i){
    inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
  }

  // Return output_xadj in CSR format
  output_xadj[0] = 0;
  for(size_t i = 0; i < this->num_global_parts ; ++i){
    output_xadj[i+1] = this->part_xadj[i];
  }

  delete future_num_part_in_parts;
  delete next_future_num_parts_in_parts;

  //free the extra memory that we allocated.
  freeArray<mj_part_t>(this->assigned_part_ids);
  freeArray<mj_gno_t>(this->initial_mj_gnos);
  freeArray<mj_gno_t>(this->current_mj_gnos);
  freeArray<bool>(tmp_mj_uniform_weights);
  freeArray<bool>(tmp_mj_uniform_parts);
  freeArray<mj_scalar_t *>(tmp_mj_weights);
  freeArray<mj_scalar_t *>(tmp_mj_part_sizes);

  this->free_work_memory();

#ifdef HAVE_ZOLTAN2_OMP
  //omp_set_num_threads(actual_num_threads);
#endif
}
1898 
/*! \brief Default constructor: zero/NULL-initializes every member, then sets
 * the floating-point epsilons and the scalar min/max sentinels used by the
 * partitioning routines.
 *
 * NOTE(review): the constructor's qualified-name line (presumably
 * "AlgMJ<...>::AlgMJ():") appears to have been lost in this extraction;
 * the member-initializer list below follows directly — confirm against the
 * repository before editing.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
    mj_env(), mj_problemComm(), imbalance_tolerance(0),
    part_no_array(NULL), recursion_depth(0), coord_dim(0),
    num_weights_per_coord(0), initial_num_loc_coords(0),
    initial_num_glob_coords(0),
    num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
    mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
    mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
    initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
    coordinate_permutations(NULL), new_coordinate_permutations(NULL),
    assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
    distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
    mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
    check_migrate_avoid_migration_option(0), migration_type(0), minimum_migration_imbalance(0.30),
    num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
    max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
    last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
    all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
    thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
    target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
    cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
    process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
    is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
    thread_part_weights(NULL), thread_part_weight_work(NULL),
    thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
    thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
    global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
    global_total_part_weight_left_right_closests(NULL),
    kept_boxes(),global_box(),
    myRank(0), myActualRank(0), divide_to_prime_first(false)
{
  // sEpsilon is a widened epsilon used for scalar comparisons in the algorithm.
  this->fEpsilon = std::numeric_limits<float>::epsilon();
  this->sEpsilon = std::numeric_limits<mj_scalar_t>::epsilon() * 100;

  // Note: minScalar_t is -max(), not lowest(); for floating-point scalars
  // these are equal in magnitude.
  this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
  this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();

}
1942 
1943 
/*! \brief Returns the bounding box of the global domain.
 * NOTE(review): the qualified-name line (presumably
 * "AlgMJ<...>::get_global_box() const") appears to have been lost in this
 * extraction between the return-type line and the opening brace — confirm
 * against the repository.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
{
  return this->global_box;
}
1954 
/*! \brief Turns on recording of part bounding boxes.
 * NOTE(review): the qualified-name line (presumably
 * "void AlgMJ<...>::set_to_keep_part_boxes(){") appears to have been lost in
 * this extraction — confirm against the repository.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
  this->mj_keep_part_boxes = true;
}
1963 
1964 
1965 /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
1966  * the input. part_no_array takes
1967  * precedence if both are provided.
1968  * Depending on these parameters, total cut/part number,
1969  * maximum part/cut number along a dimension, estimated number of reduceAlls,
1970  * and the number of parts before the last dimension is calculated.
1971  * */
/*! \brief Derives the part/cut bookkeeping (total part and cut counts, the
 * maximum part/cut count along any dimension, an estimate of the number of
 * reduceAll operations, and the part count before the last dimension) from
 * either the user-supplied part_no_array or num_global_parts; also clamps
 * max_concurrent_part_calculation to what is achievable.
 *
 * NOTE(review): the qualified-name line (presumably
 * "void AlgMJ<...>::set_part_specifications(){") appears to have been lost in
 * this extraction — confirm against the repository.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>

  this->total_num_cut = 0; //how many cuts will be totally
  this->total_num_part = 1; //how many parts will be totally
  this->max_num_part_along_dim = 0; //maximum part count along a dimension.
  this->total_dim_num_reduce_all = 0; //estimate on #reduceAlls can be done.
  this->last_dim_num_part = 1; //max no of parts that might occur
                               //during the partition before the
                               //last partitioning dimension.
  this->max_num_cut_along_dim = 0;
  this->max_num_total_part_along_dim = 0;

  if (this->part_no_array){
    //if user provided part array, traverse the array and set variables.
    for (int i = 0; i < this->recursion_depth; ++i){
      this->total_dim_num_reduce_all += this->total_num_part;
      this->total_num_part *= this->part_no_array[i];
      if(this->part_no_array[i] > this->max_num_part_along_dim) {
        this->max_num_part_along_dim = this->part_no_array[i];
      }
    }
    // Parts before the last dimension = product of all but the last entry.
    this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
    this->num_global_parts = this->total_num_part;
  } else {
    mj_part_t future_num_parts = this->num_global_parts;

    //we need to calculate the part numbers now, to determine the maximum along the dimensions.
    for (int i = 0; i < this->recursion_depth; ++i){

      // Take the (depth - i)-th root to keep the split as square as possible.
      mj_part_t maxNoPartAlongI = this->get_part_count(
        future_num_parts, 1.0f / (this->recursion_depth - i));

      if (maxNoPartAlongI > this->max_num_part_along_dim){
        this->max_num_part_along_dim = maxNoPartAlongI;
      }

      // Ceiling division: remaining parts after splitting along this dim.
      mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
      if (future_num_parts % maxNoPartAlongI){
        ++nfutureNumParts;
      }
      future_num_parts = nfutureNumParts;
    }
    this->total_num_part = this->num_global_parts;

    if (this->divide_to_prime_first){
      // Presumably a conservative (upper-bound) estimate for the
      // prime-first division strategy — confirm against the algorithm docs.
      this->total_dim_num_reduce_all = this->num_global_parts * 2;
      this->last_dim_num_part = this->num_global_parts;
    }
    else {
      //this is the lower bound.

      //estimate reduceAll Count here.
      //we find the upperbound instead.
      size_t p = 1;

      for (int i = 0; i < this->recursion_depth; ++i){
        this->total_dim_num_reduce_all += p;
        p *= this->max_num_part_along_dim;
      }

      if (p / this->max_num_part_along_dim > this->num_global_parts){
        this->last_dim_num_part = this->num_global_parts;
      }
      else {
        this->last_dim_num_part = p / this->max_num_part_along_dim;
      }

    }
  }

  this->total_num_cut = this->total_num_part - 1;
  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
  this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
  //maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1

  //refine the concurrent part count, if it is given bigger than the maximum possible part count.
  if(this->max_concurrent_part_calculation > this->last_dim_num_part){
    if(this->mj_problemComm->getRank() == 0){
      std::cerr << "Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
        ") has been set bigger than maximum amount that can be used." <<
        " Setting to:" << this->last_dim_num_part << "." << std::endl;
    }
    this->max_concurrent_part_calculation = this->last_dim_num_part;
  }

}
2060 /* \brief Tries to determine the part number for current dimension,
2061  * by trying to make the partitioning as square as possible.
2062  * \param num_total_future how many more partitionings are required.
2063  * \param root the exponent to apply: the reciprocal of the number of remaining recursion levels.
2064  */
2065 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2066  typename mj_part_t>
2067 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2068  mj_part_t num_total_future,
2069  double root)
2070 {
2071  double fp = pow(num_total_future, root);
2072  mj_part_t ip = mj_part_t (fp);
2073  if (fp - ip < this->fEpsilon * 100){
2074  return ip;
2075  }
2076  else {
2077  return ip + 1;
2078  }
2079 }
2080 
2081 /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
2082  * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
2083  * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
2084  * If part boxes are kept, then initializes each entry of output_part_boxes from its ancestor box.
2085  *
2086  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
2087  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
2088  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
2089  * \param future_num_parts: output, max number of future parts that will be obtained from a single part.
2090  * \param current_num_parts: input, how many parts are there currently.
2091  * \param current_iteration: input, current dimension iteration number.
2092  * \param input_part_boxes: input, if boxes are kept, current boxes.
2093  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
2094  */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
    std::vector <mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
    std::vector<mj_part_t> *future_num_part_in_parts,
    std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
    mj_part_t &future_num_parts,
    mj_part_t current_num_parts,
    int current_iteration,
    RCP<mj_partBoxVector_t> input_part_boxes,
    RCP<mj_partBoxVector_t> output_part_boxes,
    mj_part_t atomic_part_count
){
    //how many parts that will be obtained after this dimension.
    mj_part_t output_num_parts = 0;
    if(this->part_no_array){
        //when the partNo array is provided as input,
        //each current partition will be partition to the same number of parts.
        //we dont need to use the future_num_part_in_parts vector in this case.

        mj_part_t p = this->part_no_array[current_iteration];
        //a requested part count below 1 is invalid input: report and abort.
        if (p < 1){
            std::cout << "i:" << current_iteration << " p is given as:" << p << std::endl;
            exit(1);
        }
        //p == 1 means no split along this dimension: the current part layout
        //is returned unchanged and the output vectors stay empty.
        if (p == 1){
            return current_num_parts;
        }

        //every current part is split into the same p pieces.
        for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
            num_partitioning_in_current_dim.push_back(p);
        }

        //set the new value of future_num_parts:
        //each of the p children has 1/p of the remaining future parts.
        future_num_parts /= num_partitioning_in_current_dim[0];
        output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];

        if (this->mj_keep_part_boxes){
            for (mj_part_t k = 0; k < current_num_parts; ++k){
                //initialized the output boxes as its ancestor.
                for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
                    output_part_boxes->push_back((*input_part_boxes)[k]);
                }
            }
        }

        //set the how many more parts each part will be divided.
        //this is obvious when partNo array is provided as input.
        //however, fill this so that weights will be calculated according to this array.
        for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
            next_future_num_parts_in_parts->push_back(future_num_parts);
        }
    }
    else {
        //if partNo array is not provided as input,
        //future_num_part_in_parts holds how many parts each part should be divided.
        //initially it holds a single number equal to the total number of global parts.

        //calculate the future_num_parts from beginning,
        //since each part might be divided into different number of parts.
        future_num_parts = 1;

        for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
            //get how many parts a part should be divided.
            mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];

            //get the ideal number of parts that is close to the
            //(recursion_depth - i) root of the future_num_parts_of_part_ii.
            mj_part_t num_partitions_in_current_dim =
                this->get_part_count(
                    future_num_parts_of_part_ii,
                    1.0 / (this->recursion_depth - current_iteration)
                );

            //sanity guard: get_part_count must never exceed the precomputed
            //maximum part count along a dimension; abort loudly if it does.
            if (num_partitions_in_current_dim > this->max_num_part_along_dim){
                std::cerr << "ERROR: maxPartNo calculation is wrong. num_partitions_in_current_dim: "
                    << num_partitions_in_current_dim << "this->max_num_part_along_dim:"
                    << this->max_num_part_along_dim <<
                    " this->recursion_depth:" << this->recursion_depth <<
                    " current_iteration:" << current_iteration <<
                    " future_num_parts_of_part_ii:" << future_num_parts_of_part_ii <<
                    " might need to fix max part no calculation for largest_prime_first partitioning" <<
                    std::endl;
                exit(1);
            }
            //add this number to num_partitioning_in_current_dim vector.
            num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);

            mj_part_t largest_prime_factor = num_partitions_in_current_dim;
            if (this->divide_to_prime_first){

                //increase the output number of parts.
                output_num_parts += num_partitions_in_current_dim;

                //if the remaining work equals one atomic chunk, or does not
                //divide evenly into atomic chunks, fall back to chunk size 1.
                //NOTE: this mutates the local copy of atomic_part_count and so
                //also affects the remaining iterations of the ii loop.
                if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
                    atomic_part_count = 1;
                }

                largest_prime_factor = this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);

                //we divide to num_partitions_in_current_dim. But we adjust the weights based on largest prime/
                //if num_partitions_in_current_dim = 2, largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
                //if the largest prime is less than part count, we use the part count so that we divide uniformly.
                if (largest_prime_factor < num_partitions_in_current_dim){
                    largest_prime_factor = num_partitions_in_current_dim;
                }

                //ideal number of future partitions for each part.
                mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
                //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
                mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;

                for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
                    //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
                    mj_part_t my_ideal_primescale = ideal_prime_scale;
                    //left over weighs. Left side is adjusted to be 3x, right side stays as 2x
                    if (iii < (largest_prime_factor) % num_partitions_in_current_dim){
                        ++my_ideal_primescale;
                    }
                    //scale with 'x';
                    mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part * my_ideal_primescale;

                    //if there is a remainder in the part increase the part weight.
                    if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor){
                        //if not uniform, add 1 for the extra parts.
                        ++num_future_parts_for_part_iii;
                    }

                    next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);

                    //if part boxes are stored, initialize the box of the parts as the ancestor.
                    if (this->mj_keep_part_boxes){
                        output_part_boxes->push_back((*input_part_boxes)[ii]);
                    }

                    //set num future_num_parts to maximum in this part.
                    if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;

                }


            }
            else {

                //increase the output number of parts.
                output_num_parts += num_partitions_in_current_dim;



                //same atomic-chunk fallback as in the prime-first branch; the
                //mutated atomic_part_count carries over to later iterations.
                if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
                    atomic_part_count = 1;
                }
                //ideal number of future partitions for each part.
                mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / num_partitions_in_current_dim;
                for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
                    mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;

                    //if there is a remainder in the part increase the part weight.
                    if (iii < (future_num_parts_of_part_ii / atomic_part_count) % num_partitions_in_current_dim){
                        //if not uniform, add 1 for the extra parts.
                        ++num_future_parts_for_part_iii;
                    }

                    next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);

                    //if part boxes are stored, initialize the box of the parts as the ancestor.
                    if (this->mj_keep_part_boxes){
                        output_part_boxes->push_back((*input_part_boxes)[ii]);
                    }

                    //set num future_num_parts to maximum in this part.
                    if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
                }
            }
        }
    }
    return output_num_parts;
}
2284 
2285 
2286 /* \brief Allocates and initializes the work memory that will be used by MJ.
2287  *
2288  * */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
    // Allocates and initializes every work array the partitioning loop uses.
    // All allocations go through allocMemory/freeArray (project-local helpers),
    // and are released elsewhere — this function only sets the arrays up.

    //points to process that initially owns the coordinate.
    this->owner_of_coordinate = NULL;

    //Throughout the partitioning execution,
    //instead of the moving the coordinates, hold a permutation array for parts.
    //coordinate_permutations holds the current permutation.
    this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
    //initial configuration, set each pointer-i to i.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
        this->coordinate_permutations[i] = i;
    }

    //new_coordinate_permutations holds the current permutation.
    this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);

    this->assigned_part_ids = NULL;
    if(this->num_local_coords > 0){
        this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
    }

    //single partition starts at index-0, and ends at numLocalCoords
    //inTotalCounts array holds the end points in coordinate_permutations array
    //for each partition. Initially sized 1, and single element is set to numLocalCoords.
    this->part_xadj = allocMemory<mj_lno_t>(1);
    this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords);//the end of the initial partition is the end of coordinates.
    //the ends points of the output, this is allocated later.
    this->new_part_xadj = NULL;

    // only store this much if cuts are needed to be stored.
    //this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->total_num_cut);


    this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);

    //2 slots per thread: [0, num_threads) holds per-thread minima,
    //[num_threads, 2*num_threads) holds per-thread maxima.
    this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);

    this->process_cut_line_weight_to_put_left = NULL; //how much weight percentage should a MPI put left side of the each cutline
    this->thread_cut_line_weight_to_put_left = NULL; //how much weight percentage should each thread in MPI put left side of the each outline
    //distribute_points_on_cut_lines = false;
    if(this->distribute_points_on_cut_lines){
        this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
        this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
        for(int i = 0; i < this->num_threads; ++i){
            this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
        }
        this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
        this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
    }


    // work array to manipulate coordinate of cutlines in different iterations.
    //necessary because previous cut line information is used for determining
    //the next cutline information. therefore, cannot update the cut work array
    //until all cutlines are determined.
    this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
            this->max_concurrent_part_calculation);


    //cumulative part weight array.
    this->target_part_weights = allocMemory<mj_scalar_t>(
            this->max_num_part_along_dim * this->max_concurrent_part_calculation);
    // the weight from left to write.

    this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);  //upper bound coordinate of a cut line
    this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);  //lower bound coordinate of a cut line
    this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);  //lower bound weight of a cut line
    this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);  //upper bound weight of a cut line

    this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part.
    this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight.

    //is_cut_line_determined is used to determine if a cutline is determined already.
    //If a cut line is already determined, the next iterations will skip this cut line.
    this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
    //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
    //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
    this->my_incomplete_cut_count =  allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
    //local part weights of each thread.
    this->thread_part_weights = allocMemory<double *>(this->num_threads);
    //the work manupulation array for partweights.
    this->thread_part_weight_work = allocMemory<double *>(this->num_threads);

    //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
    this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
    //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
    this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);

    //to store how many points in each part a thread has.
    this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);

    for(int i = 0; i < this->num_threads; ++i){
        //partWeights[i] = allocMemory<mj_scalar_t>(maxTotalPartCount);
        this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
        this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
        this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
        this->thread_point_counts[i] =  allocMemory<mj_lno_t>(this->max_num_part_along_dim);
    }
    //for faster communication, concatanation of
    //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
    //leftClosest distances sized P-1, since P-1 cut lines
    //rightClosest distances size P-1, since P-1 cut lines.
    this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
    this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);


    //deep-copy the input coordinates so MJ can work on its own buffers
    //without touching the caller-owned arrays.
    mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
    for (int i=0; i < this->coord_dim; i++){
        coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
        for (mj_lno_t j=0; j < this->num_local_coords; j++)
            coord[i][j] = this->mj_coordinates[i][j];
    }
    this->mj_coordinates = coord;


    //criteria_dim is at least 1 even when no weights are given, so the
    //weights pointer array is always allocated.
    int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
    mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim);

    for (int i=0; i < criteria_dim; i++){
        weights[i] = NULL;
    }
    for (int i=0; i < this->num_weights_per_coord; i++){
        weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
        for (mj_lno_t j=0; j < this->num_local_coords; j++)
            weights[i][j] = this->mj_weights[i][j];

    }
    this->mj_weights = weights;
    //working copy of the global ids; updated as coordinates migrate.
    this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (mj_lno_t j=0; j < this->num_local_coords; j++)
        this->current_mj_gnos[j] = this->initial_mj_gnos[j];

    this->owner_of_coordinate  = allocMemory<int>(this->num_local_coords);

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (mj_lno_t j=0; j < this->num_local_coords; j++)
        this->owner_of_coordinate[j] = this->myActualRank;
}
2444 
2445 /* \brief compute the global bounding box
2446  */
2447 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2448  typename mj_part_t>
2449 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2450 {
2451  //local min coords
2452  mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2453  //global min coords
2454  mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2455  //local max coords
2456  mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2457  //global max coords
2458  mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2459 
2460  for (int i = 0; i < this->coord_dim; ++i){
2461  mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2462  mj_scalar_t localMax = -localMin;
2463  if (localMax > 0) localMax = 0;
2464 
2465 
2466  for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2467  if (this->mj_coordinates[i][j] < localMin){
2468  localMin = this->mj_coordinates[i][j];
2469  }
2470  if (this->mj_coordinates[i][j] > localMax){
2471  localMax = this->mj_coordinates[i][j];
2472  }
2473  }
2474  //std::cout << " localMin:" << localMin << std::endl;
2475  //std::cout << " localMax:" << localMax << std::endl;
2476  mins[i] = localMin;
2477  maxs[i] = localMax;
2478 
2479  }
2480  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2481  this->coord_dim, mins, gmins
2482  );
2483 
2484 
2485  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2486  this->coord_dim, maxs, gmaxs
2487  );
2488 
2489 
2490 
2491  //create single box with all areas.
2492  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2493  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2494  freeArray<mj_scalar_t>(mins);
2495  freeArray<mj_scalar_t>(gmins);
2496  freeArray<mj_scalar_t>(maxs);
2497  freeArray<mj_scalar_t>(gmaxs);
2498 }
2499 
2500 /* \brief for part communication we keep track of the box boundaries.
2501  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
2502  * This function initializes a single box with all global min and max coordinates.
2503  * \param initial_partitioning_boxes the input and output vector for boxes.
2504  */
2505 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2506  typename mj_part_t>
2507 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2508  RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2509 )
2510 {
2511  mj_partBox_t tmp_box(*global_box);
2512  initial_partitioning_boxes->push_back(tmp_box);
2513 }
2514 
2525 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2526  typename mj_part_t>
2527 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2528  mj_lno_t coordinate_begin_index,
2529  mj_lno_t coordinate_end_index,
2530  mj_lno_t *mj_current_coordinate_permutations,
2531  mj_scalar_t *mj_current_dim_coords,
2532  mj_scalar_t &min_coordinate,
2533  mj_scalar_t &max_coordinate,
2534  mj_scalar_t &total_weight){
2535 
2536  //if the part is empty.
2537  //set the min and max coordinates as reverse.
2538  if(coordinate_begin_index >= coordinate_end_index)
2539  {
2540  min_coordinate = this->maxScalar_t;
2541  max_coordinate = this->minScalar_t;
2542  total_weight = 0;
2543  }
2544  else {
2545  mj_scalar_t my_total_weight = 0;
2546 #ifdef HAVE_ZOLTAN2_OMP
2547 #pragma omp parallel num_threads(this->num_threads)
2548 #endif
2549  {
2550  //if uniform weights are used, then weight is equal to count.
2551  if (this->mj_uniform_weights[0]) {
2552 #ifdef HAVE_ZOLTAN2_OMP
2553 #pragma omp single
2554 #endif
2555  {
2556  my_total_weight = coordinate_end_index - coordinate_begin_index;
2557  }
2558 
2559  }
2560  else {
2561  //if not uniform, then weights are reducted from threads.
2562 #ifdef HAVE_ZOLTAN2_OMP
2563 #pragma omp for reduction(+:my_total_weight)
2564 #endif
2565  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2566  int i = mj_current_coordinate_permutations[ii];
2567  my_total_weight += this->mj_weights[0][i];
2568  }
2569  }
2570 
2571  int my_thread_id = 0;
2572 #ifdef HAVE_ZOLTAN2_OMP
2573  my_thread_id = omp_get_thread_num();
2574 #endif
2575  mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2576  my_thread_min_coord=my_thread_max_coord
2577  =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2578 
2579 
2580 #ifdef HAVE_ZOLTAN2_OMP
2581 #pragma omp for
2582 #endif
2583  for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2584  int i = mj_current_coordinate_permutations[j];
2585  if(mj_current_dim_coords[i] > my_thread_max_coord)
2586  my_thread_max_coord = mj_current_dim_coords[i];
2587  if(mj_current_dim_coords[i] < my_thread_min_coord)
2588  my_thread_min_coord = mj_current_dim_coords[i];
2589  }
2590  this->max_min_coords[my_thread_id] = my_thread_min_coord;
2591  this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2592 
2593 #ifdef HAVE_ZOLTAN2_OMP
2594 //we need a barrier here, because max_min_array might not be filled by some of the threads.
2595 #pragma omp barrier
2596 #pragma omp single nowait
2597 #endif
2598  {
2599  min_coordinate = this->max_min_coords[0];
2600  for(int i = 1; i < this->num_threads; ++i){
2601  if(this->max_min_coords[i] < min_coordinate)
2602  min_coordinate = this->max_min_coords[i];
2603  }
2604  }
2605 
2606 #ifdef HAVE_ZOLTAN2_OMP
2607 #pragma omp single nowait
2608 #endif
2609  {
2610  max_coordinate = this->max_min_coords[this->num_threads];
2611  for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2612  if(this->max_min_coords[i] > max_coordinate)
2613  max_coordinate = this->max_min_coords[i];
2614  }
2615  }
2616  }
2617  total_weight = my_total_weight;
2618  }
2619 }
2620 
2621 
2629 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2630  typename mj_part_t>
2631 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2632  mj_part_t current_concurrent_num_parts,
2633  mj_scalar_t *local_min_max_total,
2634  mj_scalar_t *global_min_max_total){
2635 
2636  //reduce min for first current_concurrent_num_parts elements, reduce max for next
2637  //concurrentPartCount elements,
2638  //reduce sum for the last concurrentPartCount elements.
2639  if(this->comm->getSize() > 1){
2641  reductionOp(
2642  current_concurrent_num_parts,
2643  current_concurrent_num_parts,
2644  current_concurrent_num_parts);
2645  try{
2646  reduceAll<int, mj_scalar_t>(
2647  *(this->comm),
2648  reductionOp,
2649  3 * current_concurrent_num_parts,
2650  local_min_max_total,
2651  global_min_max_total);
2652  }
2653  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2654  }
2655  else {
2656  mj_part_t s = 3 * current_concurrent_num_parts;
2657  for (mj_part_t i = 0; i < s; ++i){
2658  global_min_max_total[i] = local_min_max_total[i];
2659  }
2660  }
2661 }
2662 
2663 
2664 
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
    mj_scalar_t min_coord,
    mj_scalar_t max_coord,
    mj_part_t num_cuts/*p-1*/ ,
    mj_scalar_t global_weight,
    mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
    mj_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,

    std::vector <mj_part_t> *future_num_part_in_parts, //the vector holding how many future parts each current part will be divided into.
    std::vector <mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t concurrent_current_part,
    mj_part_t obtained_part_index
){
    // Places the initial cut coordinates uniformly with respect to the number
    // of future parts on each side, and sets the cumulative target weight for
    // the region left of each cut.

    mj_scalar_t coord_range = max_coord - min_coord;
    if(this->mj_uniform_parts[0]){
        {
            mj_part_t cumulative = 0;
            //how many total future parts the part will be partitioned into.
            mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);


            //how much each part should weigh in ideal case.
            mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;

            for(mj_part_t i = 0; i < num_cuts; ++i){
                cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];

                //set target part weight: cumulative future-part count times the
                //ideal per-future-part weight.
                current_target_part_weights[i] = cumulative * unit_part_weight;
                //set initial cut coordinate: proportional position within the
                //coordinate range.
                initial_cut_coords[i] = min_coord + (coord_range * cumulative) / total_future_part_count_in_part;
            }
            //NOTE(review): the last target entry is set to 1 rather than
            //global_weight — presumably unused or overwritten downstream;
            //confirm before relying on it.
            current_target_part_weights[num_cuts] = 1;
        }

        //round the target part weights.
        if (this->mj_uniform_weights[0]){
            for(mj_part_t i = 0; i < num_cuts + 1; ++i){
                //with uniform (unit) weights the targets are counts, so round
                //to the nearest integer.
                current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
            }
        }
    }
    else {
        //non-uniform part weights are not implemented: fail loudly.
        std::cerr << "MJ does not support non uniform part weights" << std::endl;
        exit(1);
    }
}
2745 
2746 
2759 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2760  typename mj_part_t>
2761 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
2762  mj_scalar_t &max_coordinate,
2763  mj_scalar_t &min_coordinate,
2764  mj_part_t &concurrent_current_part_index,
2765  mj_lno_t coordinate_begin_index,
2766  mj_lno_t coordinate_end_index,
2767  mj_lno_t *mj_current_coordinate_permutations,
2768  mj_scalar_t *mj_current_dim_coords,
2769  mj_part_t *mj_part_ids,
2770  mj_part_t &partition_count
2771 ){
2772  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2773 
2774  //if there is single point, or if all points are along a line.
2775  //set initial part to 0 for all.
2776  if(ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2777 #ifdef HAVE_ZOLTAN2_OMP
2778 #pragma omp parallel for
2779 #endif
2780  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2781  mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2782  }
2783  }
2784  else{
2785 
2786  //otherwise estimate an initial part for each coordinate.
2787  //assuming uniform distribution of points.
2788  mj_scalar_t slice = coordinate_range / partition_count;
2789 
2790 #ifdef HAVE_ZOLTAN2_OMP
2791 #pragma omp parallel for
2792 #endif
2793  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2794 
2795  mj_lno_t iii = mj_current_coordinate_permutations[ii];
2796  mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2797  mj_part_ids[iii] = 2 * pp;
2798  }
2799  }
2800 }
2801 
2802 
2813 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2814  typename mj_part_t>
2815 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
2816  mj_scalar_t *mj_current_dim_coords,
2817  mj_scalar_t used_imbalance_tolerance,
2818  mj_part_t current_work_part,
2819  mj_part_t current_concurrent_num_parts,
2820  mj_scalar_t *current_cut_coordinates,
2821  mj_part_t total_incomplete_cut_count,
2822  std::vector <mj_part_t> &num_partitioning_in_current_dim
2823 ){
2824 
2825 
2826  mj_part_t rectilinear_cut_count = 0;
2827  mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2828 
2830  *reductionOp = NULL;
2831  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
2832  <mj_part_t, mj_scalar_t>(
2833  &num_partitioning_in_current_dim ,
2834  current_work_part ,
2835  current_concurrent_num_parts);
2836 
2837  size_t total_reduction_size = 0;
2838 #ifdef HAVE_ZOLTAN2_OMP
2839 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) num_threads(this->num_threads)
2840 #endif
2841  {
2842  int me = 0;
2843 #ifdef HAVE_ZOLTAN2_OMP
2844  me = omp_get_thread_num();
2845 #endif
2846  double *my_thread_part_weights = this->thread_part_weights[me];
2847  mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2848  mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2849 
2850 #ifdef HAVE_ZOLTAN2_OMP
2851 #pragma omp single
2852 #endif
2853  {
2854  //initialize the lower and upper bounds of the cuts.
2855  mj_part_t next = 0;
2856  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2857 
2858  mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2859  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2860  total_reduction_size += (4 * num_cut_in_dim + 1);
2861 
2862  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2863  this->is_cut_line_determined[next] = false;
2864  this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
2865  this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate
2866 
2867  this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
2868  this->cut_lower_bound_weights[next] = 0;
2869 
2870  if(this->distribute_points_on_cut_lines){
2871  this->process_cut_line_weight_to_put_left[next] = 0;
2872  }
2873  ++next;
2874  }
2875  }
2876  }
2877 
2878  //no need to have barrier here.
2879  //pragma omp single have implicit barrier.
2880 
2881  int iteration = 0;
2882  while (total_incomplete_cut_count != 0){
2883  iteration += 1;
2884  mj_part_t concurrent_cut_shifts = 0;
2885  size_t total_part_shift = 0;
2886 
2887  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2888  mj_part_t num_parts = -1;
2889  num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2890 
2891  mj_part_t num_cuts = num_parts - 1;
2892  size_t total_part_count = num_parts + size_t (num_cuts) ;
2893  if (this->my_incomplete_cut_count[kk] > 0){
2894 
2895  //although isDone shared, currentDone is private and same for all.
2896  bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2897  double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2898  mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2899  mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2900 
2901  mj_part_t conccurent_current_part = current_work_part + kk;
2902  mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2903  mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2904  mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2905 
2906  mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2907  mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2908 
2909  // compute part weights using existing cuts
2910  this->mj_1D_part_get_thread_part_weights(
2911  total_part_count,
2912  num_cuts,
2913  max_coord,//globalMinMaxTotal[kk + concurrentPartCount],//maxScalar,
2914  min_coord,//globalMinMaxTotal[kk]//minScalar,
2915  coordinate_begin_index,
2916  coordinate_end_index,
2917  mj_current_dim_coords,
2918  temp_current_cut_coords,
2919  current_cut_status,
2920  my_current_part_weights,
2921  my_current_left_closest,
2922  my_current_right_closest);
2923 
2924  }
2925 
2926  concurrent_cut_shifts += num_cuts;
2927  total_part_shift += total_part_count;
2928  }
2929 
2930  //sum up the results of threads
2931  this->mj_accumulate_thread_results(
2932  num_partitioning_in_current_dim,
2933  current_work_part,
2934  current_concurrent_num_parts);
2935 
2936  //now sum up the results of mpi processors.
2937 #ifdef HAVE_ZOLTAN2_OMP
2938 #pragma omp single
2939 #endif
2940  {
2941  if(this->comm->getSize() > 1){
2942  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2943  total_reduction_size,
2944  this->total_part_weight_left_right_closests,
2945  this->global_total_part_weight_left_right_closests);
2946 
2947  }
2948  else {
2949  memcpy(
2950  this->global_total_part_weight_left_right_closests,
2951  this->total_part_weight_left_right_closests,
2952  total_reduction_size * sizeof(mj_scalar_t));
2953  }
2954  }
2955 
2956  //how much cut will be shifted for the next part in the concurrent part calculation.
2957  mj_part_t cut_shift = 0;
2958 
2959  //how much the concantaneted array will be shifted for the next part in concurrent part calculation.
2960  size_t tlr_shift = 0;
2961  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2962  mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2963  mj_part_t num_cuts = num_parts - 1;
2964  size_t num_total_part = num_parts + size_t (num_cuts) ;
2965 
2966  //if the cuts of this cut has already been completed.
2967  //nothing to do for this part.
2968  //just update the shift amount and proceed.
2969  if (this->my_incomplete_cut_count[kk] == 0) {
2970  cut_shift += num_cuts;
2971  tlr_shift += (num_total_part + 2 * num_cuts);
2972  continue;
2973  }
2974 
2975  mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2976  mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2977  mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
2978  mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
2979  mj_scalar_t *current_global_part_weights = current_global_tlr;
2980  bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2981 
2982  mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2983  mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2984 
2985  mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2986  mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2987  mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2988  mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2989  mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2990  mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2991  mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2992 
2993  mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2994 
2995  // Now compute the new cut coordinates.
2996  this->mj_get_new_cut_coordinates(
2997  num_total_part,
2998  num_cuts,
2999  max_coordinate,
3000  min_coordinate,
3001  global_total_weight,
3002  used_imbalance_tolerance,
3003  current_global_part_weights,
3004  current_local_part_weights,
3005  current_part_target_weights,
3006  current_cut_line_determined,
3007  temp_cut_coords + cut_shift,
3008  current_cut_upper_bounds,
3009  current_cut_lower_bounds,
3010  current_global_left_closest_points,
3011  current_global_right_closest_points,
3012  current_cut_lower_bound_weights,
3013  current_cut_upper_weights,
3014  this->cut_coordinates_work_array +cut_shift, //new cut coordinates
3015  current_part_cut_line_weight_to_put_left,
3016  &rectilinear_cut_count,
3017  this->my_incomplete_cut_count[kk]);
3018 
3019  cut_shift += num_cuts;
3020  tlr_shift += (num_total_part + 2 * num_cuts);
3021  mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
3022 #ifdef HAVE_ZOLTAN2_OMP
3023 #pragma omp single
3024 #endif
3025  {
3026  total_incomplete_cut_count -= iteration_complete_cut_count;
3027  }
3028 
3029  }
3030  { //This unnecessary bracket works around a compiler bug in NVCC when compiling with OpenMP enabled
3031 #ifdef HAVE_ZOLTAN2_OMP
3032 #pragma omp barrier
3033 #pragma omp single
3034 #endif
3035  {
3036  //swap the cut coordinates for next iteration.
3037  mj_scalar_t *t = temp_cut_coords;
3038  temp_cut_coords = this->cut_coordinates_work_array;
3039  this->cut_coordinates_work_array = t;
3040  }
3041  }
3042  }
3043 
3044  //if (myRank == 0)
3045  //std::cout << "iteration:" << iteration << " partition:" << num_partitioning_in_current_dim[current_work_part] << std::endl;
3046  // Needed only if keep_cuts; otherwise can simply swap array pointers
3047  // cutCoordinates and cutCoordinatesWork.
3048  // (at first iteration, cutCoordinates == cutCoorindates_tmp).
3049  // computed cuts must be in cutCoordinates.
3050  if (current_cut_coordinates != temp_cut_coords){
3051 #ifdef HAVE_ZOLTAN2_OMP
3052 #pragma omp single
3053 #endif
3054  {
3055  mj_part_t next = 0;
3056  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3057  mj_part_t num_parts = -1;
3058  num_parts = num_partitioning_in_current_dim[current_work_part + i];
3059  mj_part_t num_cuts = num_parts - 1;
3060 
3061  for(mj_part_t ii = 0; ii < num_cuts; ++ii){
3062  current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
3063  }
3064  next += num_cuts;
3065  }
3066  }
3067 
3068 #ifdef HAVE_ZOLTAN2_OMP
3069 #pragma omp single
3070 #endif
3071  {
3072  this->cut_coordinates_work_array = temp_cut_coords;
3073  }
3074  }
3075  }
3076  delete reductionOp;
3077 }
3078 
3079 
3099 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3100  typename mj_part_t>
3101 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
3102  size_t total_part_count,
3103  mj_part_t num_cuts,
3104  mj_scalar_t max_coord,
3105  mj_scalar_t min_coord,
3106  mj_lno_t coordinate_begin_index,
3107  mj_lno_t coordinate_end_index,
3108  mj_scalar_t *mj_current_dim_coords,
3109  mj_scalar_t *temp_current_cut_coords,
3110  bool *current_cut_status,
3111  double *my_current_part_weights,
3112  mj_scalar_t *my_current_left_closest,
3113  mj_scalar_t *my_current_right_closest){
3114 
3115  // initializations for part weights, left/right closest
3116  for (size_t i = 0; i < total_part_count; ++i){
3117  my_current_part_weights[i] = 0;
3118  }
3119 
3120  //initialize the left and right closest coordinates
3121  //to their max value.
3122  for(mj_part_t i = 0; i < num_cuts; ++i){
3123  my_current_left_closest[i] = min_coord - 1;
3124  my_current_right_closest[i] = max_coord + 1;
3125  }
3126  //mj_lno_t comparison_count = 0;
3127  mj_scalar_t minus_EPSILON = -this->sEpsilon;
3128 #ifdef HAVE_ZOLTAN2_OMP
3129  //no need for the barrier as all threads uses their local memories.
3130  //dont change the static scheduling here, as it is assumed when the new
3131  //partitions are created later.
3132 #pragma omp for
3133 #endif
3134  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3135  int i = this->coordinate_permutations[ii];
3136 
3137  //the accesses to assigned_part_ids are thread safe
3138  //since each coordinate is assigned to only a single thread.
3139  mj_part_t j = this->assigned_part_ids[i] / 2;
3140 
3141  if(j >= num_cuts){
3142  j = num_cuts - 1;
3143  }
3144 
3145  mj_part_t lower_cut_index = 0;
3146  mj_part_t upper_cut_index = num_cuts - 1;
3147 
3148  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3149  bool is_inserted = false;
3150  bool is_on_left_of_cut = false;
3151  bool is_on_right_of_cut = false;
3152  mj_part_t last_compared_part = -1;
3153 
3154  mj_scalar_t coord = mj_current_dim_coords[i];
3155 
3156  while(upper_cut_index >= lower_cut_index)
3157  {
3158  //comparison_count++;
3159  last_compared_part = -1;
3160  is_on_left_of_cut = false;
3161  is_on_right_of_cut = false;
3162  mj_scalar_t cut = temp_current_cut_coords[j];
3163  mj_scalar_t distance_to_cut = coord - cut;
3164  mj_scalar_t abs_distance_to_cut = ZOLTAN2_ABS(distance_to_cut);
3165 
3166  //if it is on the line.
3167  if(abs_distance_to_cut < this->sEpsilon){
3168 
3169  my_current_part_weights[j * 2 + 1] += w;
3170  this->assigned_part_ids[i] = j * 2 + 1;
3171 
3172  //assign left and right closest point to cut as the point is on the cut.
3173  my_current_left_closest[j] = coord;
3174  my_current_right_closest[j] = coord;
3175  //now we need to check if there are other cuts on the same cut coordinate.
3176  //if there are, then we add the weight of the cut to all cuts in the same coordinate.
3177  mj_part_t kk = j + 1;
3178  while(kk < num_cuts){
3179  // Needed when cuts shared the same position
3180  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3181  if(distance_to_cut < this->sEpsilon){
3182  my_current_part_weights[2 * kk + 1] += w;
3183  my_current_left_closest[kk] = coord;
3184  my_current_right_closest[kk] = coord;
3185  kk++;
3186  }
3187  else{
3188  //cut is far away.
3189  //just check the left closest point for the next cut.
3190  if(coord - my_current_left_closest[kk] > this->sEpsilon){
3191  my_current_left_closest[kk] = coord;
3192  }
3193  break;
3194  }
3195  }
3196 
3197 
3198  kk = j - 1;
3199  //continue checking for the cuts on the left if they share the same coordinate.
3200  while(kk >= 0){
3201  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3202  if(distance_to_cut < this->sEpsilon){
3203  my_current_part_weights[2 * kk + 1] += w;
3204  //try to write the partId as the leftmost cut.
3205  this->assigned_part_ids[i] = kk * 2 + 1;
3206  my_current_left_closest[kk] = coord;
3207  my_current_right_closest[kk] = coord;
3208  kk--;
3209  }
3210  else{
3211  //if cut is far away on the left of the point.
3212  //then just compare for right closest point.
3213  if(my_current_right_closest[kk] - coord > this->sEpsilon){
3214  my_current_right_closest[kk] = coord;
3215  }
3216  break;
3217  }
3218  }
3219 
3220  is_inserted = true;
3221  break;
3222  }
3223  else {
3224  //if point is on the left of the cut.
3225  if (distance_to_cut < 0) {
3226  bool _break = false;
3227  if(j > 0){
3228  //check distance to the cut on the left the current cut compared.
3229  //if point is on the right, then we find the part of the point.
3230  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3231  if(distance_to_next_cut > this->sEpsilon){
3232  _break = true;
3233  }
3234  }
3235  //if point is not on the right of the next cut, then
3236  //set the upper bound to this cut.
3237  upper_cut_index = j - 1;
3238  //set the last part, and mark it as on the left of the last part.
3239  is_on_left_of_cut = true;
3240  last_compared_part = j;
3241  if(_break) break;
3242  }
3243  else {
3244  //if point is on the right of the cut.
3245  bool _break = false;
3246  if(j < num_cuts - 1){
3247  //check distance to the cut on the left the current cut compared.
3248  //if point is on the right, then we find the part of the point.
3249  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3250  if(distance_to_next_cut < minus_EPSILON){
3251  _break = true;
3252  }
3253  }
3254 
3255  //if point is not on the left of the next cut, then
3256  //set the upper bound to this cut.
3257  lower_cut_index = j + 1;
3258  //set the last part, and mark it as on the right of the last part.
3259  is_on_right_of_cut = true;
3260  last_compared_part = j;
3261  if(_break) break;
3262  }
3263  }
3264 
3265  j = (upper_cut_index + lower_cut_index) / 2;
3266  }
3267  if(!is_inserted){
3268  if(is_on_right_of_cut){
3269 
3270  //add it to the right of the last compared part.
3271  my_current_part_weights[2 * last_compared_part + 2] += w;
3272  this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3273 
3274  //update the right closest point of last compared cut.
3275  if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3276  my_current_right_closest[last_compared_part] = coord;
3277  }
3278  //update the left closest point of the cut on the right of the last compared cut.
3279  if(last_compared_part+1 < num_cuts){
3280 
3281  if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3282  my_current_left_closest[last_compared_part + 1] = coord;
3283  }
3284  }
3285 
3286  }
3287  else if(is_on_left_of_cut){
3288 
3289  //add it to the left of the last compared part.
3290  my_current_part_weights[2 * last_compared_part] += w;
3291  this->assigned_part_ids[i] = 2 * last_compared_part;
3292 
3293 
3294  //update the left closest point of last compared cut.
3295  if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3296  my_current_left_closest[last_compared_part] = coord;
3297  }
3298 
3299  //update the right closest point of the cut on the left of the last compared cut.
3300  if(last_compared_part-1 >= 0){
3301  if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3302  my_current_right_closest[last_compared_part -1] = coord;
3303  }
3304  }
3305  }
3306  }
3307  }
3308 
3309  // prefix sum computation.
3310  //we need prefix sum for each part to determine cut positions.
3311  for (size_t i = 1; i < total_part_count; ++i){
3312  // check for cuts sharing the same position; all cuts sharing a position
3313  // have the same weight == total weight for all cuts sharing the position.
3314  // don't want to accumulate that total weight more than once.
3315  if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3316  ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3317  < this->sEpsilon){
3318  //i % 2 = 0 when part i represents the cut coordinate.
3319  //if it is a cut, and if the next cut also have the same coordinate, then
3320  //dont addup.
3321  my_current_part_weights[i] = my_current_part_weights[i-2];
3322  continue;
3323  }
3324  //otherwise do the prefix sum.
3325  my_current_part_weights[i] += my_current_part_weights[i-1];
3326  }
3327 }
3328 
3329 
/*! \brief Reduces the per-thread results (part weights and left/right
 * closest points) of mj_1D_part_get_thread_part_weights into the single
 * process-wide array total_part_weight_left_right_closests, which is
 * subsequently reduced across MPI ranks by the caller.
 *
 * Array layout per concurrent part (concatenated for all concurrent parts):
 * [num_total_part_in_part part/cut weights |
 *  num_cuts_in_part left-closest points |
 *  num_cuts_in_part right-closest points].
 *
 * Executed inside an existing parallel region: a barrier first ensures every
 * thread finished its partial weights, then a single thread does the whole
 * reduction (serial here is faster due to cache invalidation effects).
 *
 * \param num_partitioning_in_current_dim number of parts each current part
 *        is being split into along this dimension.
 * \param current_work_part index of the first part being worked on.
 * \param current_concurrent_num_parts number of parts processed concurrently.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
    const std::vector <mj_part_t> &num_partitioning_in_current_dim,
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts){

#ifdef HAVE_ZOLTAN2_OMP
    //needs barrier here, as it requires all threads to finish mj_1D_part_get_thread_part_weights
    //using parallel region here reduces the performance because of the cache invalidates.
#pragma omp barrier
#pragma omp single
#endif
    {
        size_t tlr_array_shift = 0;
        mj_part_t cut_shift = 0;

        //iterate for all concurrent parts to find the left and right closest points in the process.
        for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){

            mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
            mj_part_t num_cuts_in_part = num_parts_in_part - 1;
            size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;

            //iterate for cuts in a single part.
            for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
                mj_part_t next = tlr_array_shift + ii;
                mj_part_t cut_index = cut_shift + ii;
                // Determined cuts keep their previous values; skip them.
                if(this->is_cut_line_determined[cut_index]) continue;
                // Start the reduction from thread 0's values.
                mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
                    right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];

                //find the closest points from left and right for the cut in the process.
                // min over threads for the right side, max over threads for the left.
                for (int j = 1; j < this->num_threads; ++j){
                    if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
                        right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
                    }
                    if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
                        left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
                    }
                }
                //store the left and right closes points.
                // Closest-point sections sit after the weight section of
                // this concurrent part (see layout note above).
                this->total_part_weight_left_right_closests[num_total_part_in_part +
                    next] = left_closest_in_process;
                this->total_part_weight_left_right_closests[num_total_part_in_part +
                    num_cuts_in_part + next] = right_closest_in_process;
            }
            //set the shift position in the arrays
            tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
            cut_shift += num_cuts_in_part;
        }

        tlr_array_shift = 0;
        cut_shift = 0;
        size_t total_part_array_shift = 0;

        //iterate for all concurrent parts to find the total weight in the process.
        for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){

            mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
            mj_part_t num_cuts_in_part = num_parts_in_part - 1;
            size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;

            for(size_t j = 0; j < num_total_part_in_part; ++j){

                mj_part_t cut_ind = j / 2 + cut_shift;

                //need to check j != num_total_part_in_part - 1
                // which is same as j/2 != num_cuts_in_part.
                //we cannot check it using cut_ind, because of the concurrent part concantanetion.
                if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue;
                // Sum this slot's weight over all threads.
                double pwj = 0;
                for (int k = 0; k < this->num_threads; ++k){
                    pwj += this->thread_part_weights[k][total_part_array_shift + j];
                }
                //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts);
                this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
            }
            cut_shift += num_cuts_in_part;
            tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
            total_part_array_shift += num_total_part_in_part;
        }
    }
    //the other threads needs to wait here.
    //but we don't need a pragma omp barrier.
    //as omp single has already have implicit barrier.
}
3424 
3425 
3435 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3436  typename mj_part_t>
3437 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3438  mj_scalar_t cut_upper_bound,
3439  mj_scalar_t cut_lower_bound,
3440  mj_scalar_t cut_upper_weight,
3441  mj_scalar_t cut_lower_weight,
3442  mj_scalar_t expected_weight,
3443  mj_scalar_t &new_cut_position){
3444 
3445  if(ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3446  new_cut_position = cut_upper_bound; //or lower bound does not matter.
3447  }
3448 
3449 
3450  if(ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3451  new_cut_position = cut_lower_bound;
3452  }
3453 
3454  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3455  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3456  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3457 
3458  mj_scalar_t required_shift = (my_weight_diff / weight_range);
3459  int scale_constant = 20;
3460  int shiftint= int (required_shift * scale_constant);
3461  if (shiftint == 0) shiftint = 1;
3462  required_shift = mj_scalar_t (shiftint) / scale_constant;
3463  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3464 }
3465 
3466 
/*! \brief Assigns each coordinate to its final part for this dimension and
 * builds the new coordinate permutation, given the converged cut lines.
 *
 * Coordinates whose stored assignment is odd (2*cut+1, i.e. lying exactly on
 * a cut) are split between the two sides when rectilinear distribution
 * (distribute_points_on_cut_lines) is enabled, honoring per-thread left-side
 * weight budgets; otherwise they go to the right of the cut. Runs its own
 * OpenMP parallel region; the "omp for" loops must keep the default static
 * schedule to match the iteration/thread mapping used in mj_1D_part.
 *
 * \param num_parts number of parts this part is split into.
 * \param mj_current_dim_coords coordinates along the current dimension
 *        (unused directly here; assignments come from assigned_part_ids).
 * \param current_concurrent_cut_coordinate final cut coordinates (ascending).
 * \param coordinate_begin begin (inclusive) into coordinate_permutations.
 * \param coordinate_end   end (exclusive) into coordinate_permutations.
 * \param used_local_cut_line_weight_to_left weight this process must place
 *        left of each cut.
 * \param used_thread_part_weight_work per-thread prefix-summed part weights
 *        from the 1D partitioning.
 * \param out_part_xadj output, end index of each new part in the (local)
 *        new_coordinate_permutations, relative to coordinate_begin.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
    mj_part_t num_parts,
    mj_scalar_t *mj_current_dim_coords,
    mj_scalar_t *current_concurrent_cut_coordinate,
    mj_lno_t coordinate_begin,
    mj_lno_t coordinate_end,
    mj_scalar_t *used_local_cut_line_weight_to_left,
    double **used_thread_part_weight_work,
    mj_lno_t *out_part_xadj){

    mj_part_t num_cuts = num_parts - 1;

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel
#endif
    {
        int me = 0;
#ifdef HAVE_ZOLTAN2_OMP
        me = omp_get_thread_num();
#endif

        mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
        mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;

        //now if the rectilinear partitioning is allowed we decide how
        //much weight each thread should put to left and right.
        if (this->distribute_points_on_cut_lines){
            my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
            // this for assumes the static scheduling in mj_1D_part calculation.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
            for (mj_part_t i = 0; i < num_cuts; ++i){
                //the left to be put on the left of the cut.
                mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
                // Greedily hand out the left-side budget to threads in order,
                // each capped by that thread's own weight sitting on the cut.
                for(int ii = 0; ii < this->num_threads; ++ii){
                    if(left_weight > this->sEpsilon){
                        //the weight of thread ii on cut.
                        // prefix-summed layout: slot 2i+1 minus slot 2i is
                        // the weight exactly on cut i for thread ii.
                        mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
                        if(thread_ii_weight_on_cut < left_weight){
                            //if left weight is bigger than threads weight on cut.
                            this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
                        }
                        else {
                            //if thread's weight is bigger than space, then put only a portion.
                            this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
                        }
                        left_weight -= thread_ii_weight_on_cut;
                    }
                    else {
                        this->thread_cut_line_weight_to_put_left[ii][i] = 0;
                    }
                }
            }

            if(num_cuts > 0){
                //this is a special case. If cutlines share the same coordinate, their weights are equal.
                //we need to adjust the ratio for that.
                for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
                    if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
                        my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
                    }
                    // Round toward a fixed number of significant digits so
                    // floating-point dust does not flip later comparisons.
                    my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
                        / mj_scalar_t(SIGNIFICANCE_MUL);
                }
            }
        }

        // Reset this thread's per-part point counters.
        for(mj_part_t ii = 0; ii < num_parts; ++ii){
            thread_num_points_in_parts[ii] = 0;
        }


#ifdef HAVE_ZOLTAN2_OMP
        //dont change static scheduler. the static partitioner used later as well.
#pragma omp for
#endif
        for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){

            mj_lno_t coordinate_index = this->coordinate_permutations[ii];
            mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
            mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
            // Encoding from the 1D pass: even = 2*part, odd = 2*cut + 1.
            mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
            if(coordinate_assigned_place % 2 == 1){
                //if it is on the cut.
                if(this->distribute_points_on_cut_lines
                    && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
                    //if the rectilinear partitioning is allowed,
                    //and the thread has still space to put on the left of the cut
                    //then thread puts the vertex to left.
                    my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
                    //if putting the vertex to left increased the weight more than expected.
                    //and if the next cut is on the same coordinate,
                    //then we need to adjust how much weight next cut puts to its left as well,
                    //in order to take care of the imbalance.
                    if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
                        && coordinate_assigned_part < num_cuts - 1
                        && ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
                            current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
                        my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
                    }
                    ++thread_num_points_in_parts[coordinate_assigned_part];
                    this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
                }
                else{
                    //if there is no more space on the left, put the coordinate to the right of the cut.
                    ++coordinate_assigned_part;
                    //this while loop is necessary when a line is partitioned into more than 2 parts.
                    while(this->distribute_points_on_cut_lines &&
                        coordinate_assigned_part < num_cuts){
                        //traverse all the cut lines having the same partitiong
                        if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
                            current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
                            < this->sEpsilon){
                            //if line has enough space on left, put it there.
                            if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
                                this->sEpsilon &&
                                my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
                                ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
                                my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
                                //Again if it put too much on left of the cut,
                                //update how much the next cut sharing the same coordinate will put to its left.
                                if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
                                    coordinate_assigned_part < num_cuts - 1 &&
                                    ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
                                        current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
                                    my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
                                }
                                break;
                            }
                        }
                        else {
                            break;
                        }
                        ++coordinate_assigned_part;
                    }
                    ++thread_num_points_in_parts[coordinate_assigned_part];
                    this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
                }
            }
            else {
                //if it is already assigned to a part, then just put it to the corresponding part.
                ++thread_num_points_in_parts[coordinate_assigned_part];
                this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
            }
        }



        //now we calculate where each thread will write in new_coordinate_permutations array.
        //first we find the out_part_xadj, by marking the begin and end points of each part found.
        //the below loop find the number of points in each part, and writes it to out_part_xadj
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
        for(mj_part_t j = 0; j < num_parts; ++j){
            mj_lno_t num_points_in_part_j_upto_thread_i = 0;
            for (int i = 0; i < this->num_threads; ++i){
                mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
                //prefix sum to thread point counts, so that each will have private space to write.
                this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
                num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;

            }
            out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
        }

        //now we need to do a prefix sum to out_part_xadj[j], to point begin and end of each part.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
        {
            //perform prefix sum for num_points in parts.
            for(mj_part_t j = 1; j < num_parts; ++j){
                out_part_xadj[j] += out_part_xadj[j - 1];
            }
        }

        //shift the num points in threads thread to obtain the
        //beginning index of each thread's private space.
        for(mj_part_t j = 1; j < num_parts; ++j){
            thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
        }


        //now thread gets the coordinate and writes the index of coordinate to the permutation array
        //using the part index we calculated.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
        for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
            mj_lno_t i = this->coordinate_permutations[ii];
            mj_part_t p = this->assigned_part_ids[i];
            this->new_coordinate_permutations[coordinate_begin +
                thread_num_points_in_parts[p]++] = i;
        }
    }
}
3677 
3678 
3679 
3708 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3709  typename mj_part_t>
3710 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3711  const size_t &num_total_part,
3712  const mj_part_t &num_cuts,
3713  const mj_scalar_t &max_coordinate,
3714  const mj_scalar_t &min_coordinate,
3715  const mj_scalar_t &global_total_weight,
3716  const mj_scalar_t &used_imbalance_tolerance,
3717  mj_scalar_t * current_global_part_weights,
3718  const mj_scalar_t * current_local_part_weights,
3719  const mj_scalar_t *current_part_target_weights,
3720  bool *current_cut_line_determined,
3721  mj_scalar_t *current_cut_coordinates,
3722  mj_scalar_t *current_cut_upper_bounds,
3723  mj_scalar_t *current_cut_lower_bounds,
3724  mj_scalar_t *current_global_left_closest_points,
3725  mj_scalar_t *current_global_right_closest_points,
3726  mj_scalar_t * current_cut_lower_bound_weights,
3727  mj_scalar_t * current_cut_upper_weights,
3728  mj_scalar_t *new_current_cut_coordinates,
3729  mj_scalar_t *current_part_cut_line_weight_to_put_left,
3730  mj_part_t *rectilinear_cut_count,
3731  mj_part_t &my_num_incomplete_cut){
3732 
3733  //seen weight in the part
3734  mj_scalar_t seen_weight_in_part = 0;
3735  //expected weight for part.
3736  mj_scalar_t expected_weight_in_part = 0;
3737  //imbalance for the left and right side of the cut.
3738  mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;
3739 
3740 
3741 #ifdef HAVE_ZOLTAN2_OMP
3742 #pragma omp for
3743 #endif
3744  for (mj_part_t i = 0; i < num_cuts; i++){
3745  //if left and right closest points are not set yet,
3746  //set it to the cut itself.
3747  if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3748  current_global_left_closest_points[i] = current_cut_coordinates[i];
3749  if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3750  current_global_right_closest_points[i] = current_cut_coordinates[i];
3751 
3752  }
3753 #ifdef HAVE_ZOLTAN2_OMP
3754 #pragma omp for
3755 #endif
3756  for (mj_part_t i = 0; i < num_cuts; i++){
3757 
3758  if(this->distribute_points_on_cut_lines){
3759  //init the weight on the cut.
3760  this->global_rectilinear_cut_weight[i] = 0;
3761  this->process_rectilinear_cut_weight[i] = 0;
3762  }
3763  //if already determined at previous iterations,
3764  //then just write the coordinate to new array, and proceed.
3765  if(current_cut_line_determined[i]) {
3766  new_current_cut_coordinates[i] = current_cut_coordinates[i];
3767  continue;
3768  }
3769 
3770  //current weight of the part at the left of the cut line.
3771  seen_weight_in_part = current_global_part_weights[i * 2];
3772 
3773  /*
3774  std::cout << "seen_weight_in_part:" << i << " is "<< seen_weight_in_part <<std::endl;
3775  std::cout << "\tcut:" << current_cut_coordinates[i]
3776  << " current_cut_lower_bounds:" << current_cut_lower_bounds[i]
3777  << " current_cut_upper_bounds:" << current_cut_upper_bounds[i] << std::endl;
3778  */
3779  //expected ratio
3780  expected_weight_in_part = current_part_target_weights[i];
3781  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
3782  imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3783  //rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected);
3784  imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3785 
3786  bool is_left_imbalance_valid = ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3787  bool is_right_imbalance_valid = ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3788 
3789  //if the cut line reaches to desired imbalance.
3790  if(is_left_imbalance_valid && is_right_imbalance_valid){
3791  current_cut_line_determined[i] = true;
3792 #ifdef HAVE_ZOLTAN2_OMP
3793 #pragma omp atomic
3794 #endif
3795  my_num_incomplete_cut -= 1;
3796  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3797  continue;
3798  }
3799  else if(imbalance_on_left < 0){
3800  //if left imbalance < 0 then we need to move the cut to right.
3801 
3802  if(this->distribute_points_on_cut_lines){
3803  //if it is okay to distribute the coordinate on
3804  //the same coordinate to left and right.
3805  //then check if we can reach to the target weight by including the
3806  //coordinates in the part.
3807  if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3808  //if it is we are done.
3809  current_cut_line_determined[i] = true;
3810 #ifdef HAVE_ZOLTAN2_OMP
3811 #pragma omp atomic
3812 #endif
3813  my_num_incomplete_cut -= 1;
3814 
3815  //then assign everything on the cut to the left of the cut.
3816  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3817 
3818  //for this cut all the weight on cut will be put to left.
3819 
3820  current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3821  continue;
3822  }
3823  else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3824 
3825  //if the weight is larger than the expected weight,
3826  //then we need to distribute some points to left, some to right.
3827  current_cut_line_determined[i] = true;
3828 #ifdef HAVE_ZOLTAN2_OMP
3829 #pragma omp atomic
3830 #endif
3831  *rectilinear_cut_count += 1;
3832  //increase the num cuts to be determined with rectilinear partitioning.
3833 
3834 #ifdef HAVE_ZOLTAN2_OMP
3835 #pragma omp atomic
3836 #endif
3837  my_num_incomplete_cut -= 1;
3838  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3839  this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3840  current_local_part_weights[i * 2];
3841  continue;
3842  }
3843  }
3844  //we need to move further right,so set lower bound to current line, and shift it to the closes point from right.
3845  current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3846  //set the lower bound weight to the weight we have seen.
3847  current_cut_lower_bound_weights[i] = seen_weight_in_part;
3848 
3849  //compare the upper bound with what has been found in the last iteration.
3850  //we try to make more strict bounds for the cut here.
3851  for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3852  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3853  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3854 
3855  if(p_weight >= expected_weight_in_part){
3856  //if a cut on the right has the expected weight, then we found
3857  //our cut position. Set up and low coordiantes to this new cut coordinate.
3858  //but we need one more iteration to finalize the cut position,
3859  //as wee need to update the part ids.
3860  if(p_weight == expected_weight_in_part){
3861  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3862  current_cut_upper_weights[i] = p_weight;
3863  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3864  current_cut_lower_bound_weights[i] = p_weight;
3865  } else if (p_weight < current_cut_upper_weights[i]){
3866  //if a part weight is larger then my expected weight,
3867  //but lower than my upper bound weight, update upper bound.
3868  current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3869  current_cut_upper_weights[i] = p_weight;
3870  }
3871  break;
3872  }
3873  //if comes here then pw < ew
3874  //then compare the weight against line weight.
3875  if(line_weight >= expected_weight_in_part){
3876  //if the line is larger than the expected weight,
3877  //then we need to reach to the balance by distributing coordinates on this line.
3878  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3879  current_cut_upper_weights[i] = line_weight;
3880  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3881  current_cut_lower_bound_weights[i] = p_weight;
3882  break;
3883  }
3884  //if a stricter lower bound is found,
3885  //update the lower bound.
3886  if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3887  current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3888  current_cut_lower_bound_weights[i] = p_weight;
3889  }
3890  }
3891 
3892 
3893  mj_scalar_t new_cut_position = 0;
3894  this->mj_calculate_new_cut_position(
3895  current_cut_upper_bounds[i],
3896  current_cut_lower_bounds[i],
3897  current_cut_upper_weights[i],
3898  current_cut_lower_bound_weights[i],
3899  expected_weight_in_part, new_cut_position);
3900 
3901  //if cut line does not move significantly.
3902  //then finalize the search.
3903  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3904  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/
3905  ){
3906  current_cut_line_determined[i] = true;
3907 #ifdef HAVE_ZOLTAN2_OMP
3908 #pragma omp atomic
3909 #endif
3910  my_num_incomplete_cut -= 1;
3911 
3912  //set the cut coordinate and proceed.
3913  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3914  } else {
3915  new_current_cut_coordinates [i] = new_cut_position;
3916  }
3917  } else {
3918 
3919  //need to move the cut line to left.
3920  //set upper bound to current line.
3921  current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3922  current_cut_upper_weights[i] = seen_weight_in_part;
3923 
3924  // compare the current cut line weights with previous upper and lower bounds.
3925  for (int ii = i - 1; ii >= 0; --ii){
3926  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3927  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3928  if(p_weight <= expected_weight_in_part){
3929  if(p_weight == expected_weight_in_part){
3930  //if the weight of the part is my expected weight
3931  //then we find the solution.
3932  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3933  current_cut_upper_weights[i] = p_weight;
3934  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3935  current_cut_lower_bound_weights[i] = p_weight;
3936  }
3937  else if (p_weight > current_cut_lower_bound_weights[i]){
3938  //if found weight is bigger than the lower bound
3939  //then update the lower bound.
3940  current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3941  current_cut_lower_bound_weights[i] = p_weight;
3942 
3943  //at the same time, if weight of line is bigger than the
3944  //expected weight, then update the upper bound as well.
3945  //in this case the balance will be obtained by distributing weightss
3946  //on this cut position.
3947  if(line_weight > expected_weight_in_part){
3948  current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3949  current_cut_upper_weights[i] = line_weight;
3950  }
3951  }
3952  break;
3953  }
3954  //if the weight of the cut on the left is still bigger than my weight,
3955  //and also if the weight is smaller than the current upper weight,
3956  //or if the weight is equal to current upper weight, but on the left of
3957  // the upper weight, then update upper bound.
3958  if (p_weight >= expected_weight_in_part &&
3959  (p_weight < current_cut_upper_weights[i] ||
3960  (p_weight == current_cut_upper_weights[i] &&
3961  current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3962  )
3963  )
3964  ){
3965  current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3966  current_cut_upper_weights[i] = p_weight;
3967  }
3968  }
3969  mj_scalar_t new_cut_position = 0;
3970  this->mj_calculate_new_cut_position(
3971  current_cut_upper_bounds[i],
3972  current_cut_lower_bounds[i],
3973  current_cut_upper_weights[i],
3974  current_cut_lower_bound_weights[i],
3975  expected_weight_in_part,
3976  new_cut_position);
3977 
3978  //if cut line does not move significantly.
3979  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3980  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){
3981  current_cut_line_determined[i] = true;
3982 #ifdef HAVE_ZOLTAN2_OMP
3983 #pragma omp atomic
3984 #endif
3985  my_num_incomplete_cut -= 1;
3986  //set the cut coordinate and proceed.
3987  new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3988  } else {
3989  new_current_cut_coordinates [ i] = new_cut_position;
3990  }
3991  }
3992  }
3993 
3994  { // This unnecessary bracket works around a compiler bug in NVCC when enabling OpenMP as well
3995 
3996  //communication to determine the ratios of processors for the distribution
3997  //of coordinates on the cut lines.
3998 #ifdef HAVE_ZOLTAN2_OMP
3999  //no need barrier here as it is implicit.
4000 #pragma omp single
4001 #endif
4002  {
4003  if(*rectilinear_cut_count > 0){
4004 
4005  try{
4006  Teuchos::scan<int,mj_scalar_t>(
4007  *comm, Teuchos::REDUCE_SUM,
4008  num_cuts,
4009  this->process_rectilinear_cut_weight,
4010  this->global_rectilinear_cut_weight
4011  );
4012  }
4013  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
4014 
4015  for (mj_part_t i = 0; i < num_cuts; ++i){
4016  //if cut line weight to be distributed.
4017  if(this->global_rectilinear_cut_weight[i] > 0) {
4018  //expected weight to go to left of the cut.
4019  mj_scalar_t expected_part_weight = current_part_target_weights[i];
4020  //the weight that should be put to left of the cut.
4021  mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
4022  //the weight of the cut in the process
4023  mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
4024  //the sum of the cut weights upto this process, including the weight of this process.
4025  mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
4026  //the space on the left side of the cut after all processes before this process (including this process)
4027  //puts their weights on cut to left.
4028  mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
4029  //add my weight to this space to find out how much space is left to me.
4030  mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
4031 
4032  /*
4033  std::cout << "expected_part_weight:" << expected_part_weight
4034  << " necessary_weight_on_line_for_left:" << necessary_weight_on_line_for_left
4035  << " my_weight_on_line" << my_weight_on_line
4036  << " weight_on_line_upto_process_inclusive:" << weight_on_line_upto_process_inclusive
4037  << " space_to_put_left:" << space_to_put_left
4038  << " space_left_to_me" << space_left_to_me << std::endl;
4039  */
4040  if(space_left_to_me < 0){
4041  //space_left_to_me is negative and i dont need to put anything to left.
4042  current_part_cut_line_weight_to_put_left[i] = 0;
4043  }
4044  else if(space_left_to_me >= my_weight_on_line){
4045  //space left to me is bigger than the weight of the processor on cut.
4046  //so put everything to left.
4047  current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
4048  //std::cout << "setting current_part_cut_line_weight_to_put_left to my_weight_on_line:" << my_weight_on_line << std::endl;
4049  }
4050  else {
4051  //put only the weight as much as the space.
4052  current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
4053 
4054  //std::cout << "setting current_part_cut_line_weight_to_put_left to space_left_to_me:" << space_left_to_me << std::endl;
4055  }
4056 
4057  }
4058  }
4059  *rectilinear_cut_count = 0;
4060  }
4061  }
4062  }
4063 }
4064 
4074 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4075  typename mj_part_t>
4076 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
4077  mj_part_t num_procs,
4078  mj_part_t num_parts,
4079  mj_gno_t *&num_points_in_all_processor_parts){
4080 
4081  //initially allocation_size is num_parts
4082  size_t allocation_size = num_parts * (num_procs + 1);
4083 
4084  //this will be output
4085  //holds how many each processor has in each part.
4086  //last portion is the sum of all processor points in each part.
4087 
4088  //allocate memory for the local num coordinates in each part.
4089  mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
4090 
4091 
4092  //this is the portion of the memory which will be used
4093  //at the summation to obtain total number of processors' points in each part.
4094  mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
4095  //this is the portion of the memory where each stores its local number.
4096  //this information is needed by other processors.
4097  mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
4098 
4099  //initialize the array with 0's.
4100  memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(mj_gno_t)*allocation_size);
4101 
4102  //write the number of coordinates in each part.
4103  for (mj_part_t i = 0; i < num_parts; ++i){
4104  mj_lno_t part_begin_index = 0;
4105  if (i > 0){
4106  part_begin_index = this->new_part_xadj[i - 1];
4107  }
4108  mj_lno_t part_end_index = this->new_part_xadj[i];
4109  my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
4110  }
4111 
4112  //copy the local num parts to the last portion of array,
4113  //so that this portion will represent the global num points in each part after the reduction.
4114  memcpy (my_local_point_counts_in_each_art,
4115  my_local_points_to_reduce_sum,
4116  sizeof(mj_gno_t) * (num_parts) );
4117 
4118 
4119  //reduceAll operation.
4120  //the portion that belongs to a processor with index p
4121  //will start from myRank * num_parts.
4122  //the global number of points will be held at the index
4123  try{
4124  reduceAll<int, mj_gno_t>(
4125  *(this->comm),
4126  Teuchos::REDUCE_SUM,
4127  allocation_size,
4128  num_local_points_in_each_part_to_reduce_sum,
4129  num_points_in_all_processor_parts);
4130  }
4131  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
4132  freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4133 }
4134 
4135 
4136 
4149 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4150  typename mj_part_t>
4151 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4152  size_t migration_reduce_all_population,
4153  mj_lno_t num_coords_for_last_dim_part,
4154  mj_part_t num_procs,
4155  mj_part_t num_parts,
4156  mj_gno_t *num_points_in_all_processor_parts){
4157 
4158  //if reduce all count and population in the last dim is too high
4159  if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true;
4160  //if the work in a part per processor in the last dim is too low.
4161  if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true;
4162 
4163  //if migration is to be checked and the imbalance is too high
4164  if (this->check_migrate_avoid_migration_option == 0){
4165  double global_imbalance = 0;
4166  //global shift to reach the sum of coordiante count in each part.
4167  size_t global_shift = num_procs * num_parts;
4168 
4169  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4170  for (mj_part_t i = 0; i < num_parts; ++i){
4171  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4172  / double(num_procs);
4173 
4174  global_imbalance += ZOLTAN2_ABS(ideal_num -
4175  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4176  }
4177  }
4178  global_imbalance /= num_parts;
4179  global_imbalance /= num_procs;
4180 
4181  /*
4182  if (this->myRank == 0) {
4183  std::cout << "imbalance for next iteration:" << global_imbalance << std::endl;
4184  }
4185  */
4186 
4187  if(global_imbalance <= this->minimum_migration_imbalance){
4188  return false;
4189  }
4190  else {
4191  return true;
4192  }
4193  }
4194  else {
4195  //if migration is forced
4196  return true;
4197  }
4198 }
4199 
4200 
4210 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4211  typename mj_part_t>
4212 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4213  mj_part_t num_parts,
4214  mj_part_t *part_assignment_proc_begin_indices,
4215  mj_part_t *processor_chains_in_parts,
4216  mj_lno_t *send_count_to_each_proc,
4217  int *coordinate_destinations){
4218 
4219  for (mj_part_t p = 0; p < num_parts; ++p){
4220  mj_lno_t part_begin = 0;
4221  if (p > 0) part_begin = this->new_part_xadj[p - 1];
4222  mj_lno_t part_end = this->new_part_xadj[p];
4223 
4224  //get the first part that current processor will send its part-p.
4225  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4226  //initialize how many point I sent to this processor.
4227  mj_lno_t num_total_send = 0;
4228  for (mj_lno_t j=part_begin; j < part_end; j++){
4229  mj_lno_t local_ind = this->new_coordinate_permutations[j];
4230  while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4231  //then get the next processor to send the points in part p.
4232  num_total_send = 0;
4233  //assign new processor to part_assign_begin[p]
4234  part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4235  //remove the previous processor
4236  processor_chains_in_parts[proc_to_sent] = -1;
4237  //choose the next processor as the next one to send.
4238  proc_to_sent = part_assignment_proc_begin_indices[p];
4239  }
4240  //write the gno index to corresponding position in sendBuf.
4241  coordinate_destinations[local_ind] = proc_to_sent;
4242  ++num_total_send;
4243  }
4244  }
4245 }
4246 
4261 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4262  typename mj_part_t>
4263 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4264  mj_gno_t * num_points_in_all_processor_parts,
4265  mj_part_t num_parts,
4266  mj_part_t num_procs,
4267  mj_lno_t *send_count_to_each_proc,
4268  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4269  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4270  mj_part_t &out_part_index,
4271  mj_part_t &output_part_numbering_begin_index,
4272  int *coordinate_destinations){
4273 
4274 
4275  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4276  mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4277 
4278  //boolean variable if the process finds its part to be assigned.
4279  bool did_i_find_my_group = false;
4280 
4281  mj_part_t num_free_procs = num_procs;
4282  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4283 
4284  double max_imbalance_difference = 0;
4285  mj_part_t max_differing_part = 0;
4286 
4287  //find how many processor each part requires.
4288  for (mj_part_t i=0; i < num_parts; i++){
4289 
4290  //scalar portion of the required processors
4291  double scalar_required_proc = num_procs *
4292  (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4293 
4294  //round it to closest integer.
4295  mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc);
4296 
4297  //if assigning the required num procs, creates problems for the rest of the parts.
4298  //then only assign {num_free_procs - (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
4299  if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4300  required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4301  }
4302 
4303  //reduce the free processor count
4304  num_free_procs -= required_proc;
4305  //reduce the free minimum processor count required for the rest of the part by 1.
4306  --minimum_num_procs_required_for_rest_of_parts;
4307 
4308  //part (i) is assigned to (required_proc) processors.
4309  num_procs_assigned_to_each_part[i] = required_proc;
4310 
4311  //because of the roundings some processors might be left as unassigned.
4312  //we want to assign those processors to the part with most imbalance.
4313  //find the part with the maximum imbalance here.
4314  double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4315  if (imbalance_wrt_ideal > max_imbalance_difference){
4316  max_imbalance_difference = imbalance_wrt_ideal;
4317  max_differing_part = i;
4318  }
4319  }
4320 
4321  //assign extra processors to the part with maximum imbalance than the ideal.
4322  if (num_free_procs > 0){
4323  num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4324  }
4325 
4326  //now find what are the best processors with least migration for each part.
4327 
4328  //part_assignment_proc_begin_indices ([i]) is the array that holds the beginning
4329  //index of a processor that processor sends its data for part - i
4330  mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4331  //the next processor send is found in processor_chains_in_parts, in linked list manner.
4332  mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4333  mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4334 
4335  //initialize the assignment of each processor.
4336  //this has a linked list implementation.
4337  //the beginning of processors assigned
4338  //to each part is hold at part_assignment_proc_begin_indices[part].
4339  //then the next processor assigned to that part is located at
4340  //proc_part_assignments[part_assign_begins[part]], this is a chain
4341  //until the value of -1 is reached.
4342  for (int i = 0; i < num_procs; ++i ){
4343  processor_part_assignments[i] = -1;
4344  processor_chains_in_parts[i] = -1;
4345  }
4346  for (int i = 0; i < num_parts; ++i ){
4347  part_assignment_proc_begin_indices[i] = -1;
4348  }
4349 
4350 
4351  //std::cout << "Before migration: mig type:" << this->migration_type << std::endl;
4352  //Allocate memory for sorting data structure.
4353  uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4354  for(mj_part_t i = 0; i < num_parts; ++i){
4355  //the algorithm tries to minimize the cost of migration,
4356  //by assigning the processors with highest number of coordinates on that part.
4357  //here we might want to implement a maximum weighted bipartite matching algorithm.
4358  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4359  sort_item_num_part_points_in_procs[ii].id = ii;
4360  //if processor is not assigned yet.
4361  //add its num points to the sort data structure.
4362  if (processor_part_assignments[ii] == -1){
4363  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4364  sort_item_num_part_points_in_procs[ii].signbit = 1; //indicate that the processor has positive weight.
4365  }
4366  else {
4367  //if processor is already assigned, insert -nLocal - 1 so that it won't be selected again.
4368  //would be same if we simply set it to -1,
4369  //but more information with no extra cost (which is used later) is provided.
4370  //sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
4371 
4372  //UPDATE: Since above gets warning when unsigned is used to represent, we added extra bit to as sign bit to the sort item.
4373  //It is 1 for positives, 0 for negatives.
4374  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4375  sort_item_num_part_points_in_procs[ii].signbit = 0;
4376  }
4377  }
4378  //sort the processors in the part.
4379  uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4380 
4381  /*
4382  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4383  std::cout << "ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4384  " " << sort_item_num_part_points_in_procs[ii].val <<
4385  " " << int(sort_item_num_part_points_in_procs[ii].signbit) << std::endl;
4386  }
4387  */
4388 
4389  mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4390  mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4391  mj_gno_t ideal_num_points_in_a_proc =
4392  Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part / double (required_proc_count)));
4393 
4394  //starts sending to least heaviest part.
4395  mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4396  mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4397  mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4398 
4399  //find the processors that will be assigned to this part, which are the heaviest
4400  //non assigned processors.
4401  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4402  mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4403  //assign processor to part - i.
4404  processor_part_assignments[proc_id] = i;
4405  }
4406 
4407  bool did_change_sign = false;
4408  //if processor has a minus count, reverse it.
4409  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4410  // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
4411  // TODO: SEE BUG 6194
4412  if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4413  did_change_sign = true;
4414  sort_item_num_part_points_in_procs[ii].signbit = 1;
4415  }
4416  else {
4417  break;
4418  }
4419  }
4420  if(did_change_sign){
4421  //resort the processors in the part for the rest of the processors that is not assigned.
4422  uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4423  }
4424  /*
4425  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4426  std::cout << "after resort ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4427  " " << sort_item_num_part_points_in_procs[ii].val <<
4428  " " << int(sort_item_num_part_points_in_procs[ii].signbit ) << std::endl;
4429  }
4430  */
4431 
4432  //check if this processors is one of the procs assigned to this part.
4433  //if it is, then get the group.
4434  if (!did_i_find_my_group){
4435  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4436 
4437  mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4438  //add the proc to the group.
4439  processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4440 
4441  if(proc_id_to_assign == this->myRank){
4442  //if the assigned process is me, then I find my group.
4443  did_i_find_my_group = true;
4444  //set the beginning of part i to my rank.
4445  part_assignment_proc_begin_indices[i] = this->myRank;
4446  processor_chains_in_parts[this->myRank] = -1;
4447 
4448  //set send count to myself to the number of points that I have in part i.
4449  send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4450 
4451  //calculate the shift required for the output_part_numbering_begin_index
4452  for (mj_part_t in = 0; in < i; ++in){
4453  output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4454  }
4455  out_part_index = i;
4456  }
4457  }
4458  //if these was not my group,
4459  //clear the subcomminicator processor array.
4460  if (!did_i_find_my_group){
4461  processor_ranks_for_subcomm.clear();
4462  }
4463  }
4464 
4465  //send points of the nonassigned coordinates to the assigned coordinates.
4466  //starts from the heaviest nonassigned processor.
4467  //TODO we might want to play with this part, that allows more computational imbalance
4468  //but having better communication balance.
4469  for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4470  mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4471  mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4472 
4473  //we set number of points to -to_sent - 1 for the assigned processors.
4474  //we reverse it here. This should not happen, as we have already reversed them above.
4475 #ifdef MJ_DEBUG
4476  if (num_points_to_sent < 0) {
4477  std::cout << "Migration - processor assignments - for part:" << i << "from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << std::endl;
4478  exit(1);
4479  }
4480 #endif
4481 
4482  switch (migration_type){
4483  case 0:
4484  {
4485  //now sends the points to the assigned processors.
4486  while (num_points_to_sent > 0){
4487  //if the processor has enough space.
4488  if (num_points_to_sent <= space_left_in_sent_proc){
4489  //reduce the space left in the processor.
4490  space_left_in_sent_proc -= num_points_to_sent;
4491  //if my rank is the one that is sending the coordinates.
4492  if (this->myRank == nonassigned_proc_id){
4493  //set my sent count to the sent processor.
4494  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4495  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4496  //that the processor will send its point in part-i.
4497  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4498  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4499  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4500  }
4501  num_points_to_sent = 0;
4502  }
4503  else {
4504  //there might be no space left in the processor.
4505  if(space_left_in_sent_proc > 0){
4506  num_points_to_sent -= space_left_in_sent_proc;
4507 
4508  //send as the space left in the processor.
4509  if (this->myRank == nonassigned_proc_id){
4510  //send as much as the space in this case.
4511  send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4512  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4513  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4514  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4515 
4516  }
4517  }
4518  //change the sent part
4519  ++next_proc_to_send_index;
4520 
4521 #ifdef MJ_DEBUG
4522  if(next_part_to_send_index < nprocs - required_proc_count ){
4523  std::cout << "Migration - processor assignments - for part:"
4524  << i
4525  << " next_part_to_send :" << next_part_to_send_index
4526  << " nprocs:" << nprocs
4527  << " required_proc_count:" << required_proc_count
4528  << " Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4529  exit(1)l
4530 
4531  }
4532 #endif
4533  //send the new id.
4534  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4535  //set the new space in the processor.
4536  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4537  }
4538  }
4539  }
4540  break;
4541  default:
4542  {
4543  //to minimize messages, we want each processor to send its coordinates to only a single point.
4544  //we do not respect imbalances here, we send all points to the next processor.
4545  if (this->myRank == nonassigned_proc_id){
4546  //set my sent count to the sent processor.
4547  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4548  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4549  //that the processor will send its point in part-i.
4550  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4551  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4552  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4553  }
4554  num_points_to_sent = 0;
4555  ++next_proc_to_send_index;
4556 
4557  //if we made it to the heaviest processor we round robin and go to beginning
4558  if (next_proc_to_send_index == num_procs){
4559  next_proc_to_send_index = num_procs - required_proc_count;
4560  }
4561  //send the new id.
4562  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4563  //set the new space in the processor.
4564  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4565  }
4566  }
4567  }
4568  }
4569 
4570  /*
4571  for (int i = 0; i < num_procs;++i){
4572  std::cout << "me:" << this->myRank << " to part:" << i << " sends:" << send_count_to_each_proc[i] << std::endl;
4573  }
4574  */
4575 
4576 
4577  this->assign_send_destinations(
4578  num_parts,
4579  part_assignment_proc_begin_indices,
4580  processor_chains_in_parts,
4581  send_count_to_each_proc,
4582  coordinate_destinations);
4583 
4584  freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4585  freeArray<mj_part_t>(processor_chains_in_parts);
4586  freeArray<mj_part_t>(processor_part_assignments);
4587  freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4588  freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4589 
4590 }
4591 
4592 
4605 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4606  typename mj_part_t>
4607 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4608  mj_part_t num_parts,
4609  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
4610  int *coordinate_destinations,
4611  mj_part_t &output_part_numbering_begin_index,
4612  std::vector<mj_part_t> *next_future_num_parts_in_parts){
4613 
4614  mj_part_t part_shift_amount = output_part_numbering_begin_index;
4615  mj_part_t previous_processor = -1;
4616  for(mj_part_t i = 0; i < num_parts; ++i){
4617  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4618  //assigned processors are sorted.
4619  mj_lno_t part_begin_index = 0;
4620  if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4621  mj_lno_t part_end_index = this->new_part_xadj[p];
4622 
4623  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4624  if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4625  output_part_numbering_begin_index = part_shift_amount;
4626  }
4627  previous_processor = assigned_proc;
4628  part_shift_amount += (*next_future_num_parts_in_parts)[p];
4629 
4630  for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4631  mj_lno_t localInd = this->new_coordinate_permutations[j];
4632  coordinate_destinations[localInd] = assigned_proc;
4633  }
4634  }
4635 }
4636 
4637 
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
// Greedily assigns whole parts to processors when there are at least as many
// parts as processors (each processor ends up owning >= 1 part; each part is
// owned by exactly one processor). Also fills the per-processor send counts
// and the coordinate destination array used by the migration step.
//
// \param num_points_in_all_processor_parts  input: [proc][part] point counts,
//        with an extra row at offset num_procs*num_parts holding the global
//        per-part totals.
// \param num_parts   input: number of current parts.
// \param num_procs   input: number of processors.
// \param send_count_to_each_proc  output: sized nprocs, the number of points
//        this rank sends to each processor.
// \param next_future_num_parts_in_parts  input: how many more partitions each
//        part will later be split into (used for output part numbering).
// \param out_num_part  output: how many parts this processor is assigned.
// \param out_part_indices  output: the part indices assigned to this rank.
// \param output_part_numbering_begin_index  in/out: shift applied to part
//        numbers when writing the solution.
// \param coordinate_destinations  output: target rank for each local coordinate.
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
    mj_gno_t * num_points_in_all_processor_parts,
    mj_part_t num_parts,
    mj_part_t num_procs,
    mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
    std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
    mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
    std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
    mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
    int *coordinate_destinations){
  out_num_part = 0;

  // The global (summed over all procs) point count of each part lives in the
  // last row of the 2D count array.
  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
  out_part_indices.clear();

  // To sort the parts that are assigned to the processors:
  // id is the part number, sort value is the assigned processor id.
  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);


  // Ideal number of coordinates per processor (rounded to nearest).
  mj_lno_t work_each = mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
  // Remaining capacity (ideal count minus assigned load) of each processor.
  mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
  // Initially every processor has its full ideal capacity free.
  for (mj_part_t i = 0; i < num_procs; ++i){
    space_in_each_processor[i] = work_each;
  }

  // We keep track of how many parts each processor is assigned to,
  // because for some degenerate inputs a processor could otherwise end
  // up with no part at all. These counters let us force every processor
  // to receive at least one part.
  mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
  int empty_proc_count = num_procs;

  // To sort the parts by decreasing point count:
  // ids are the part numbers, sort value is the number of points in each.
  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);

  // Sort the parts by their coordinate counts so that assignment starts
  // with the part holding the most coordinates.
  for (mj_part_t i = 0; i < num_parts; ++i){
    sort_item_point_counts_in_parts[i].id = i;
    sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
  }
  // uqsort sorts in increasing order; we traverse it backwards below.
  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);


  // Assign parts to processors, traversing parts in decreasing order of load:
  // the heaviest part is placed first.
  for (mj_part_t j = 0; j < num_parts; ++j){
    // Sorted with increasing order; traverse in reverse.
    mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
    // Load (global point count) of part i.
    mj_gno_t load = global_num_points_in_parts[i];

    // Processor chosen for this part; -1 until one with enough space is found.
    mj_part_t assigned_proc = -1;
    // Fallback candidate (the processor with the most free space seen) used
    // when no processor can fully fit the part.
    mj_part_t best_proc_to_assign = 0;


    // Build a sort list of processors keyed by how many of part i's points
    // they already hold locally (to minimize migration volume).
    for (mj_part_t ii = 0; ii < num_procs; ++ii){
      sort_item_num_points_of_proc_in_part_i[ii].id = ii;

      // If there are still enough remaining parts to fill all empty
      // processors, proceed normally. But once the number of empty
      // processors equals the number of remaining parts, restrict
      // assignment to empty processors only (val = -1 sinks the others).
      if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
        // How many points does processor ii hold in part i?
        sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
      }
      else {
        sort_item_num_points_of_proc_in_part_i[ii].val = -1;
      }
    }
    uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);

    // Traverse processors from most to fewest local points of part i.
    for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
      mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
      mj_lno_t left_space = space_in_each_processor[ii] - load;
      // If there is enough space, assign the part to this processor.
      if(left_space >= 0 ){
        assigned_proc = ii;
        break;
      }
      // Otherwise remember the processor with the most free space.
      if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
        best_proc_to_assign = ii;
      }
    }

    // If no processor had enough space, use the best (least-overloaded) one.
    if (assigned_proc == -1){
      assigned_proc = best_proc_to_assign;
    }

    // First part for this processor? One fewer empty processor remains.
    if (num_parts_proc_assigned[assigned_proc]++ == 0){
      --empty_proc_count;
    }
    space_in_each_processor[assigned_proc] -= load;
    // Record the pairing (sorted by processor later): part i -> assigned_proc.
    sort_item_part_to_proc_assignment[j].id = i; //part i
    sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to processor - assignment.


    // If the assigned processor is this rank, record the part as ours.
    if (assigned_proc == this->myRank){
      out_num_part++;//assigned_part_count;
      out_part_indices.push_back(i);
    }
    // Everyone sends all of their coordinates in part i to the processor
    // that owns part i, so increase our send count accordingly.
    send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
  }
  freeArray<mj_part_t>(num_parts_proc_assigned);
  freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
  freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
  freeArray<mj_lno_t >(space_in_each_processor);


  // Sort assignments with respect to the assigned processors, then use them
  // to fill the coordinate destination array and part numbering offsets.
  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
  //fill sendBuf.


  this->assign_send_destinations2(
      num_parts,
      sort_item_part_to_proc_assignment,
      coordinate_destinations,
      output_part_numbering_begin_index,
      next_future_num_parts_in_parts);

  freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
}
4798 
4799 
4817 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4818  typename mj_part_t>
4819 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
4820  mj_gno_t * num_points_in_all_processor_parts,
4821  mj_part_t num_parts,
4822  mj_part_t num_procs,
4823  mj_lno_t *send_count_to_each_proc,
4824  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4825  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4826  mj_part_t &out_num_part,
4827  std::vector<mj_part_t> &out_part_indices,
4828  mj_part_t &output_part_numbering_begin_index,
4829  int *coordinate_destinations){
4830 
4831 
4832 
4833  processor_ranks_for_subcomm.clear();
4834  // if (this->num_local_coords > 0)
4835  if (num_procs > num_parts){
4836  //if there are more processors than the number of current part
4837  //then processors share the existing parts.
4838  //at the end each processor will have a single part,
4839  //but a part will be shared by a group of processors.
4840  mj_part_t out_part_index = 0;
4841  this->mj_assign_proc_to_parts(
4842  num_points_in_all_processor_parts,
4843  num_parts,
4844  num_procs,
4845  send_count_to_each_proc,
4846  processor_ranks_for_subcomm,
4847  next_future_num_parts_in_parts,
4848  out_part_index,
4849  output_part_numbering_begin_index,
4850  coordinate_destinations
4851  );
4852 
4853  out_num_part = 1;
4854  out_part_indices.clear();
4855  out_part_indices.push_back(out_part_index);
4856  }
4857  else {
4858 
4859  //there are more parts than the processors.
4860  //therefore a processor will be assigned multiple parts,
4861  //the subcommunicators will only have a single processor.
4862  processor_ranks_for_subcomm.push_back(this->myRank);
4863 
4864  //since there are more parts then procs,
4865  //assign multiple parts to processors.
4866  this->mj_assign_parts_to_procs(
4867  num_points_in_all_processor_parts,
4868  num_parts,
4869  num_procs,
4870  send_count_to_each_proc,
4871  next_future_num_parts_in_parts,
4872  out_num_part,
4873  out_part_indices,
4874  output_part_numbering_begin_index,
4875  coordinate_destinations);
4876  }
4877 }
4878 
4891 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4892  typename mj_part_t>
4893 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
4894  mj_part_t num_procs,
4895  mj_lno_t &num_new_local_points,
4896  std::string iteration,
4897  int *coordinate_destinations,
4898  mj_part_t num_parts)
4899 {
4900 #ifdef ENABLE_ZOLTAN_MIGRATION
4901  if (sizeof(mj_lno_t) <= sizeof(int)) {
4902 
4903  // Cannot use Zoltan_Comm with local ordinals larger than ints.
4904  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
4905  // may overflow.
4906 
4907  ZOLTAN_COMM_OBJ *plan = NULL;
4908  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
4909  int num_incoming_gnos = 0;
4910  int message_tag = 7859;
4911 
4912  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4913  int ierr = Zoltan_Comm_Create(
4914  &plan,
4915  int(this->num_local_coords),
4916  coordinate_destinations,
4917  mpi_comm,
4918  message_tag,
4919  &num_incoming_gnos);
4920  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4921  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4922 
4923  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
4924  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4925 
4926  //migrate gnos.
4927  message_tag++;
4928  ierr = Zoltan_Comm_Do(
4929  plan,
4930  message_tag,
4931  (char *) this->current_mj_gnos,
4932  sizeof(mj_gno_t),
4933  (char *) incoming_gnos);
4934  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4935 
4936  freeArray<mj_gno_t>(this->current_mj_gnos);
4937  this->current_mj_gnos = incoming_gnos;
4938 
4939 
4940  //migrate coordinates
4941  for (int i = 0; i < this->coord_dim; ++i){
4942  message_tag++;
4943  mj_scalar_t *coord = this->mj_coordinates[i];
4944 
4945  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4946  ierr = Zoltan_Comm_Do(
4947  plan,
4948  message_tag,
4949  (char *) coord,
4950  sizeof(mj_scalar_t),
4951  (char *) this->mj_coordinates[i]);
4952  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4953  freeArray<mj_scalar_t>(coord);
4954  }
4955 
4956  //migrate weights.
4957  for (int i = 0; i < this->num_weights_per_coord; ++i){
4958  message_tag++;
4959  mj_scalar_t *weight = this->mj_weights[i];
4960 
4961  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4962  ierr = Zoltan_Comm_Do(
4963  plan,
4964  message_tag,
4965  (char *) weight,
4966  sizeof(mj_scalar_t),
4967  (char *) this->mj_weights[i]);
4968  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4969  freeArray<mj_scalar_t>(weight);
4970  }
4971 
4972 
4973  //migrate owners.
4974  int *coord_own = allocMemory<int>(num_incoming_gnos);
4975  message_tag++;
4976  ierr = Zoltan_Comm_Do(
4977  plan,
4978  message_tag,
4979  (char *) this->owner_of_coordinate,
4980  sizeof(int), (char *) coord_own);
4981  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4982  freeArray<int>(this->owner_of_coordinate);
4983  this->owner_of_coordinate = coord_own;
4984 
4985 
4986  //if num procs is less than num parts,
4987  //we need the part assigment arrays as well, since
4988  //there will be multiple parts in processor.
4989  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4990  if(num_procs < num_parts){
4991  message_tag++;
4992  ierr = Zoltan_Comm_Do(
4993  plan,
4994  message_tag,
4995  (char *) this->assigned_part_ids,
4996  sizeof(mj_part_t),
4997  (char *) new_parts);
4998  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4999  }
5000  freeArray<mj_part_t>(this->assigned_part_ids);
5001  this->assigned_part_ids = new_parts;
5002 
5003  ierr = Zoltan_Comm_Destroy(&plan);
5004  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5005  num_new_local_points = num_incoming_gnos;
5006  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
5007  }
5008 
5009  else
5010 
5011 #endif // ENABLE_ZOLTAN_MIGRATION
5012  {
5013  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
5014  Tpetra::Distributor distributor(this->comm);
5015  ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
5016  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
5017  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
5018 
5019  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
5020  {
5021  //migrate gnos.
5022  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
5023  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5024  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5025  freeArray<mj_gno_t>(this->current_mj_gnos);
5026  this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
5027  memcpy(
5028  this->current_mj_gnos,
5029  received_gnos.getRawPtr(),
5030  num_incoming_gnos * sizeof(mj_gno_t));
5031  }
5032  //migrate coordinates
5033  for (int i = 0; i < this->coord_dim; ++i){
5034 
5035  ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
5036  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
5037  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
5038  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5039  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5040  memcpy(
5041  this->mj_coordinates[i],
5042  received_coord.getRawPtr(),
5043  num_incoming_gnos * sizeof(mj_scalar_t));
5044  }
5045 
5046  //migrate weights.
5047  for (int i = 0; i < this->num_weights_per_coord; ++i){
5048 
5049  ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
5050  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
5051  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
5052  freeArray<mj_scalar_t>(this->mj_weights[i]);
5053  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5054  memcpy(
5055  this->mj_weights[i],
5056  received_weight.getRawPtr(),
5057  num_incoming_gnos * sizeof(mj_scalar_t));
5058  }
5059 
5060  {
5061  //migrate the owners of the coordinates
5062  ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
5063  ArrayRCP<int> received_owners(num_incoming_gnos);
5064  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
5065  freeArray<int>(this->owner_of_coordinate);
5066  this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
5067  memcpy(
5068  this->owner_of_coordinate,
5069  received_owners.getRawPtr(),
5070  num_incoming_gnos * sizeof(int));
5071  }
5072 
5073  //if num procs is less than num parts,
5074  //we need the part assigment arrays as well, since
5075  //there will be multiple parts in processor.
5076  if(num_procs < num_parts){
5077  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5078  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
5079  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5080  freeArray<mj_part_t>(this->assigned_part_ids);
5081  this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
5082  memcpy(
5083  this->assigned_part_ids,
5084  received_partids.getRawPtr(),
5085  num_incoming_gnos * sizeof(mj_part_t));
5086  }
5087  else {
5088  mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
5089  freeArray<mj_part_t>(this->assigned_part_ids);
5090  this->assigned_part_ids = new_parts;
5091  }
5092  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
5093  num_new_local_points = num_incoming_gnos;
5094 
5095  }
5096 }
5097 
5104 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5105  typename mj_part_t>
5106 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
5107  mj_part_t group_size = processor_ranks_for_subcomm.size();
5108  mj_part_t *ids = allocMemory<mj_part_t>(group_size);
5109  for(mj_part_t i = 0; i < group_size; ++i) {
5110  ids[i] = processor_ranks_for_subcomm[i];
5111  }
5112  ArrayView<const mj_part_t> idView(ids, group_size);
5113  this->comm = this->comm->createSubcommunicator(idView);
5114  freeArray<mj_part_t>(ids);
5115 }
5116 
5117 
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
// Rebuilds new_coordinate_permutations and new_part_xadj after migration so
// that local coordinates are grouped by their (compacted) part, using a
// counting-sort style pass over assigned_part_ids.
//
// \param output_num_parts  number of distinct parts this processor now owns.
// \param num_parts  the pre-migration part count; assigned_part_ids values
//        range over [0, num_parts), though only output_num_parts of them
//        occur locally.
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
    mj_part_t output_num_parts,
    mj_part_t num_parts){
  // If there is a single output part, the permutation is trivially identity.
  if (output_num_parts == 1){
    for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
      this->new_coordinate_permutations[i] = i;
    }
    this->new_part_xadj[0] = this->num_local_coords;
  }
  else {

    // Otherwise count how many points there are in each part.
    // Arrays are sized num_parts (not output_num_parts), because the
    // migrated part ids are numbered up to num_parts even though only
    // output_num_parts distinct parts are present locally.
    mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
    // part_shifts maps an old (global) part number to its new compact index.
    mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);

    memset(num_points_in_parts, 0, sizeof(mj_lno_t) * num_parts);

    // Histogram the local coordinates by assigned part id.
    for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
      mj_part_t ii = this->assigned_part_ids[i];
      ++num_points_in_parts[ii];
    }

    // Write the (exclusive-scan style) end offsets of the non-empty parts,
    // compacting part numbers to [0, p) as we go.
    mj_part_t p = 0;
    mj_lno_t prev_index = 0;
    for(mj_part_t i = 0; i < num_parts; ++i){
      if(num_points_in_parts[i] > 0) {
        this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
        prev_index += num_points_in_parts[i];
        part_shifts[i] = p++;
      }
    }

    // For the remaining (empty) slots, repeat the last end offset so the
    // xadj array stays monotone.
    mj_part_t assigned_num_parts = p - 1;
    for (;p < num_parts; ++p){
      this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
    }
    // Reuse num_points_in_parts as a cursor array: entry i now holds the
    // end offset of compacted part i, decremented as coordinates are placed.
    for(mj_part_t i = 0; i < output_num_parts; ++i){
      num_points_in_parts[i] = this->new_part_xadj[i];
    }

    // Fill the permutation array backwards: take coordinate i's part, map it
    // to its compacted number, and place i at the (pre-decremented) end of
    // that part's range. Backward traversal keeps ordering stable.
    for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
      mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
      this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
    }

    freeArray<mj_lno_t>(num_points_in_parts);
    freeArray<mj_part_t>(part_shifts);
  }
}
5183 
5184 
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
// Top-level migration driver: decides whether migration is worthwhile,
// assigns parts/processors, moves the coordinates, shrinks the communicator,
// and rebuilds the local permutation arrays.
//
// \param input_num_parts  current number of parts.
// \param output_num_parts  output: number of parts this rank owns afterwards.
// \param next_future_num_parts_in_parts  in/out: future split counts, reduced
//        to the parts this rank keeps.
// \param output_part_begin_index  output: numbering shift for this rank's parts.
// \param migration_reduce_all_population  input: cost proxy for reduce-alls.
// \param num_coords_for_last_dim_part  input: average coords per upcoming part.
// \param iteration  input: label for the migration timers.
// \param input_part_boxes / output_part_boxes  in/out: kept part boxes; the
//        two vectors are swapped so output always reflects the kept parts.
// \return true if migration was performed, false if it was skipped.
bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
    mj_part_t input_num_parts, //current umb parts
    mj_part_t &output_num_parts, //output umb parts.
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &output_part_begin_index,
    size_t migration_reduce_all_population,
    mj_lno_t num_coords_for_last_dim_part,
    std::string iteration,
    RCP<mj_partBoxVector_t> &input_part_boxes,
    RCP<mj_partBoxVector_t> &output_part_boxes
)
{
  mj_part_t num_procs = this->comm->getSize();
  this->myRank = this->comm->getRank();


  // This array holds how many points each processor has in each part.
  // To access how many points processor i has in part j:
  // num_points_in_all_processor_parts[i * num_parts + j].
  // The extra row (index num_procs) holds the global per-part totals.
  mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));

  // Gather the number of coordinates in each part on each processor.
  this->get_processor_num_points_in_parts(
      num_procs,
      input_num_parts,
      num_points_in_all_processor_parts);


  // Check whether migration should be performed at all; if not, clean up
  // and report that nothing changed.
  if (!this->mj_check_to_migrate(
      migration_reduce_all_population,
      num_coords_for_last_dim_part,
      num_procs,
      input_num_parts,
      num_points_in_all_processor_parts)){
    freeArray<mj_gno_t>(num_points_in_all_processor_parts);
    return false;
  }


  mj_lno_t *send_count_to_each_proc = NULL;
  int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
  send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
  for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;

  std::vector<mj_part_t> processor_ranks_for_subcomm;
  std::vector<mj_part_t> out_part_indices;

  // Determine which processors are assigned to which parts, and fill the
  // per-coordinate destination array.
  this->mj_migration_part_proc_assignment(
      num_points_in_all_processor_parts,
      input_num_parts,
      num_procs,
      send_count_to_each_proc,
      processor_ranks_for_subcomm,
      next_future_num_parts_in_parts,
      output_num_parts,
      out_part_indices,
      output_part_begin_index,
      coordinate_destinations);




  freeArray<mj_lno_t>(send_count_to_each_proc);
  std::vector <mj_part_t> tmpv;

  std::sort (out_part_indices.begin(), out_part_indices.end());
  mj_part_t outP = out_part_indices.size();

  mj_gno_t new_global_num_points = 0;
  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;

  if (this->mj_keep_part_boxes){
    input_part_boxes->clear();
  }

  // Compute the new values for next_future_num_parts_in_parts, restricted to
  // the parts this rank keeps; do the same for the kept part boxes.
  for (mj_part_t i = 0; i < outP; ++i){
    mj_part_t ind = out_part_indices[i];
    new_global_num_points += global_num_points_in_parts[ind];
    tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
    if (this->mj_keep_part_boxes){
      input_part_boxes->push_back((*output_part_boxes)[ind]);
    }
  }
  // Swap the input and output part boxes so "output" holds the kept boxes.
  if (this->mj_keep_part_boxes){
    RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
    input_part_boxes = output_part_boxes;
    output_part_boxes = tmpPartBoxes;
  }
  next_future_num_parts_in_parts->clear();
  for (mj_part_t i = 0; i < outP; ++i){
    mj_part_t p = tmpv[i];
    next_future_num_parts_in_parts->push_back(p);
  }

  freeArray<mj_gno_t>(num_points_in_all_processor_parts);

  mj_lno_t num_new_local_points = 0;


  // Perform the actual migration operation here.
  this->mj_migrate_coords(
      num_procs,
      num_new_local_points,
      iteration,
      coordinate_destinations,
      input_num_parts);


  freeArray<int>(coordinate_destinations);

  // Resize the permutation scratch arrays if the local count changed.
  if(this->num_local_coords != num_new_local_points){
    freeArray<mj_lno_t>(this->new_coordinate_permutations);
    freeArray<mj_lno_t>(this->coordinate_permutations);

    this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
    this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
  }
  this->num_local_coords = num_new_local_points;
  this->num_global_coords = new_global_num_points;



  // Shrink the communicator to the group that shares this rank's part(s).
  this->create_sub_communicator(processor_ranks_for_subcomm);
  processor_ranks_for_subcomm.clear();

  // Rebuild the permutation arrays for the migrated coordinates.
  this->fill_permutation_array(
      output_num_parts,
      input_num_parts);
  return true;
}
5346 
5347 
5361 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5362  typename mj_part_t>
5363 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5364  mj_part_t num_parts,
5365  mj_scalar_t *mj_current_dim_coords,
5366  mj_scalar_t *current_concurrent_cut_coordinate,
5367  mj_lno_t coordinate_begin,
5368  mj_lno_t coordinate_end,
5369  mj_scalar_t *used_local_cut_line_weight_to_left,
5370  mj_lno_t *out_part_xadj,
5371  int coordInd, bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted){
5372 
5373  //mj_lno_t numCoordsInPart = coordinateEnd - coordinateBegin;
5374  mj_part_t no_cuts = num_parts - 1;
5375 
5376 
5377 
5378  int me = 0;
5379  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5380  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5381 
5382 
5383  //now if the rectilinear partitioning is allowed we decide how
5384  //much weight each thread should put to left and right.
5385  if (this->distribute_points_on_cut_lines){
5386 
5387  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5388  for (mj_part_t i = 0; i < no_cuts; ++i){
5389  //the left to be put on the left of the cut.
5390  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5391  //std::cout << "i:" << i << " left_weight:" << left_weight << std::endl;
5392  for(int ii = 0; ii < this->num_threads; ++ii){
5393  if(left_weight > this->sEpsilon){
5394  //the weight of thread ii on cut.
5395  mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5396  if(thread_ii_weight_on_cut < left_weight){
5397  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5398  }
5399  else {
5400  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5401  }
5402  left_weight -= thread_ii_weight_on_cut;
5403  }
5404  else {
5405  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5406  }
5407  }
5408  }
5409 
5410  if(no_cuts > 0){
5411  //this is a special case. If cutlines share the same coordinate, their weights are equal.
5412  //we need to adjust the ratio for that.
5413  for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5414  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5415  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5416  }
5417  my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
5418  / mj_scalar_t(SIGNIFICANCE_MUL);
5419  }
5420  }
5421  }
5422 
5423  for(mj_part_t ii = 0; ii < num_parts; ++ii){
5424  thread_num_points_in_parts[ii] = 0;
5425  }
5426 
5427  //for this specific case we dont want to distribute the points along the cut position
5428  //randomly, as we need a specific ordering of them. Instead,
5429  //we put the coordinates into a sort item, where we sort those
5430  //using the coordinates of points on other dimensions and the index.
5431 
5432 
5433  //some of the cuts might share the same position.
5434  //in this case, if cut i and cut j share the same position
5435  //cut_map[i] = cut_map[j] = sort item index.
5436  mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5437 
5438 
5439  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5440  typedef std::vector< multiSItem > multiSVector;
5441  typedef std::vector<multiSVector> multiS2Vector;
5442 
5443  //to keep track of the memory allocated.
5444  std::vector<mj_scalar_t *>allocated_memory;
5445 
5446  //vector for which the coordinates will be sorted.
5447  multiS2Vector sort_vector_points_on_cut;
5448 
5449  //the number of cuts that have different coordinates.
5450  mj_part_t different_cut_count = 1;
5451  cut_map[0] = 0;
5452 
5453  //now we insert 1 sort vector for all cuts on the different
5454  //positins.if multiple cuts are on the same position, they share sort vectors.
5455  multiSVector tmpMultiSVector;
5456  sort_vector_points_on_cut.push_back(tmpMultiSVector);
5457 
5458  for (mj_part_t i = 1; i < no_cuts ; ++i){
5459  //if cuts share the same cut coordinates
5460  //set the cutmap accordingly.
5461  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5462  cut_map[i] = cut_map[i-1];
5463  }
5464  else {
5465  cut_map[i] = different_cut_count++;
5466  multiSVector tmp2MultiSVector;
5467  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5468  }
5469  }
5470 
5471 
5472  //now the actual part assigment.
5473  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5474 
5475  mj_lno_t i = this->coordinate_permutations[ii];
5476 
5477  mj_part_t pp = this->assigned_part_ids[i];
5478  mj_part_t p = pp / 2;
5479  //if the coordinate is on a cut.
5480  if(pp % 2 == 1 ){
5481  mj_scalar_t *vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5482  allocated_memory.push_back(vals);
5483 
5484  //we insert the coordinates to the sort item here.
5485  int val_ind = 0;
5486 
5487  if (longest_dim_part){
5488  //std::cout << std::endl << std::endl;
5489  for(int dim = this->coord_dim - 2; dim >= 0; --dim){
5490  //uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted
5491  int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
5492  //std::cout << "next_largest_coord_dim: " << next_largest_coord_dim << " ";
5493  vals[val_ind++] = this->mj_coordinates[next_largest_coord_dim][i];
5494  }
5495  }
5496  else {
5497  for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5498  vals[val_ind++] = this->mj_coordinates[dim][i];
5499  }
5500  for(int dim = 0; dim < coordInd; ++dim){
5501  vals[val_ind++] = this->mj_coordinates[dim][i];
5502  }
5503  }
5504  multiSItem tempSortItem(i, this->coord_dim -1, vals);
5505  //inser the point to the sort vector pointed by the cut_map[p].
5506  mj_part_t cmap = cut_map[p];
5507  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5508  }
5509  else {
5510  //if it is not on the cut, simple sorting.
5511  ++thread_num_points_in_parts[p];
5512  this->assigned_part_ids[i] = p;
5513  }
5514  }
5515 
5516  //sort all the sort vectors.
5517  for (mj_part_t i = 0; i < different_cut_count; ++i){
5518  std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5519  }
5520 
5521  //we do the part assignment for the points on cuts here.
5522  mj_part_t previous_cut_map = cut_map[0];
5523 
5524  //this is how much previous part owns the weight of the current part.
5525  //when target part weight is 1.6, and the part on the left is given 2,
5526  //the left has an extra 0.4, while the right has missing 0.4 from the previous cut.
5527  //this parameter is used to balance this issues.
5528  //in the above example weight_stolen_from_previous_part will be 0.4.
5529  //if the left part target is 2.2 but it is given 2,
5530  //then weight_stolen_from_previous_part will be -0.2.
5531  mj_scalar_t weight_stolen_from_previous_part = 0;
5532  for (mj_part_t p = 0; p < no_cuts; ++p){
5533 
5534  mj_part_t mapped_cut = cut_map[p];
5535 
5536  //if previous cut map is done, and it does not have the same index,
5537  //then assign all points left on that cut to its right.
5538  if (previous_cut_map != mapped_cut){
5539  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5540  for (; sort_vector_end >= 0; --sort_vector_end){
5541  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5542  mj_lno_t i = t.index;
5543  ++thread_num_points_in_parts[p];
5544  this->assigned_part_ids[i] = p;
5545  }
5546  sort_vector_points_on_cut[previous_cut_map].clear();
5547  }
5548 
5549  //TODO: MD: I dont remember why I have it reverse order here.
5550  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5551  //mj_lno_t sort_vector_begin= 0;
5552  //mj_lno_t sort_vector_size = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
5553 
5554  //TODO commented for reverse order
5555  for (; sort_vector_end >= 0; --sort_vector_end){
5556  //for (; sort_vector_begin < sort_vector_size; ++sort_vector_begin){
5557  //TODO COMMENTED FOR REVERSE ORDER
5558  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5559  //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
5560  mj_lno_t i = t.index;
5561  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5562 
5563 
5564  //part p has enough space for point i, then put it to point i.
5565  if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5566  my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5567  > this->sEpsilon){
5568 
5569  my_local_thread_cut_weights_to_put_left[p] -= w;
5570  sort_vector_points_on_cut[mapped_cut].pop_back();
5571  ++thread_num_points_in_parts[p];
5572  this->assigned_part_ids[i] = p;
5573  //if putting this weight to left overweights the left cut, then
5574  //increase the space for the next cut using weight_stolen_from_previous_part.
5575  if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5576  if(mapped_cut == cut_map[p + 1] ){
5577  //if the cut before the cut indexed at p was also at the same position
5578  //special case, as we handle the weight differently here.
5579  if (previous_cut_map != mapped_cut){
5580  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5581  }
5582  else {
5583  //if the cut before the cut indexed at p was also at the same position
5584  //we assign extra weights cumulatively in this case.
5585  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5586  }
5587  }
5588  else{
5589  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5590  }
5591  //end assignment for part p
5592  break;
5593  }
5594  } else {
5595  //if part p does not have enough space for this point
5596  //and if there is another cut sharing the same positon,
5597  //again increase the space for the next
5598  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5599  if (previous_cut_map != mapped_cut){
5600  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5601  }
5602  else {
5603  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5604  }
5605  }
5606  else{
5607  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5608  }
5609  //end assignment for part p
5610  break;
5611  }
5612  }
5613  previous_cut_map = mapped_cut;
5614  }
5615 
5616  //TODO commented for reverse order
5617  //put everything left on the last cut to the last part.
5618  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5619 
5620  //mj_lno_t sort_vector_begin= 0;
5621  //mj_lno_t sort_vector_size = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size();
5622  //TODO commented for reverse order
5623  for (; sort_vector_end >= 0; --sort_vector_end){
5624  //for (; sort_vector_begin < sort_vector_size; ++sort_vector_begin){
5625  //TODO commented for reverse order
5626  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5627  //multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
5628  mj_lno_t i = t.index;
5629  ++thread_num_points_in_parts[no_cuts];
5630  this->assigned_part_ids[i] = no_cuts;
5631  }
5632  sort_vector_points_on_cut[previous_cut_map].clear();
5633  freeArray<mj_part_t> (cut_map);
5634 
5635  //free the memory allocated for vertex sort items .
5636  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5637  for(mj_lno_t i = 0; i < vSize; ++i){
5638  freeArray<mj_scalar_t> (allocated_memory[i]);
5639  }
5640 
5641  //creation of part_xadj as in usual case.
5642  for(mj_part_t j = 0; j < num_parts; ++j){
5643  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5644  for (int i = 0; i < this->num_threads; ++i){
5645  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5646  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5647  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5648 
5649  }
5650  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
5651  }
5652 
5653  //perform prefix sum for num_points in parts.
5654  for(mj_part_t j = 1; j < num_parts; ++j){
5655  out_part_xadj[j] += out_part_xadj[j - 1];
5656  }
5657 
5658 
5659  //shift the num points in threads thread to obtain the
5660  //beginning index of each thread's private space.
5661  for(mj_part_t j = 1; j < num_parts; ++j){
5662  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5663  }
5664 
5665  //now thread gets the coordinate and writes the index of coordinate to the permutation array
5666  //using the part index we calculated.
5667  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5668  mj_lno_t i = this->coordinate_permutations[ii];
5669  mj_part_t p = this->assigned_part_ids[i];
5670  this->new_coordinate_permutations[coordinate_begin +
5671  thread_num_points_in_parts[p]++] = i;
5672  }
5673 }
5674 
5675 
5676 
/*! \brief Finalizes the partitioning: writes the final (shifted) part number
 * of every local coordinate into this->assigned_part_ids and, if the
 * coordinates were ever migrated during partitioning, communicates the
 * global ids and part numbers back to the processes that originally owned
 * the coordinates (recorded in this->owner_of_coordinate).
 *
 * \param current_num_parts number of parts in the final partitioning.
 * \param output_part_begin_index offset added to each local part index;
 *        part i is reported as i + output_part_begin_index.
 * \param output_part_boxes when part boxes are kept, box i is tagged with
 *        its shifted part id, and the global box boundaries are computed
 *        at the end.
 * \param is_data_ever_migrated true iff data was migrated at some point of
 *        the partitioning; triggers the reverse communication below.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
    mj_part_t current_num_parts,
    mj_part_t output_part_begin_index,
    RCP<mj_partBoxVector_t> &output_part_boxes,
    bool is_data_ever_migrated)
{
    this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Part_Assignment");

    // Part i owns the slice [part_xadj[i-1], part_xadj[i]) of the coordinate
    // permutation array; label those coordinates with the shifted part id.
    // Parts are disjoint slices, so the loop parallelizes safely.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for(mj_part_t i = 0; i < current_num_parts;++i){

        mj_lno_t begin = 0;
        mj_lno_t end = this->part_xadj[i];

        if(i > 0) begin = this->part_xadj[i -1];
        mj_part_t part_to_set_index = i + output_part_begin_index;
        if (this->mj_keep_part_boxes){
            (*output_part_boxes)[i].setpId(part_to_set_index);
        }
        for (mj_lno_t ii = begin; ii < end; ++ii){
            mj_lno_t k = this->coordinate_permutations[ii];
            this->assigned_part_ids[k] = part_to_set_index;
        }
    }

    //ArrayRCP<const mj_gno_t> gnoList;
    if(!is_data_ever_migrated){
        // Data was never migrated: assigned_part_ids already refers to the
        // coordinates this process originally owned; nothing more to do.
        //freeArray<mj_gno_t>(this->current_mj_gnos);
        //if(this->num_local_coords > 0){
        //    gnoList = arcpFromArrayView(this->mj_gnos);
        //}
    }
    else {
#ifdef ENABLE_ZOLTAN_MIGRATION
        if (sizeof(mj_lno_t) <= sizeof(int)) {

            // Cannot use Zoltan_Comm with local ordinals larger than ints.
            // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
            // may overflow.

            //if data is migrated, then send part numbers to the original owners.
            ZOLTAN_COMM_OBJ *plan = NULL;
            MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));

            int incoming = 0;
            int message_tag = 7856;

            // Build a communication plan that routes each local coordinate
            // back to its original owner; 'incoming' is how many this
            // process receives.
            this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating");
            int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
                this->owner_of_coordinate, mpi_comm, message_tag,
                &incoming);
            Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
            this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating" );

            mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);

            // Ship the global ids back to their original owners and replace
            // the local array with the received one.
            message_tag++;
            this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
            ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
                sizeof(mj_gno_t), (char *) incoming_gnos);
            Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);

            freeArray<mj_gno_t>(this->current_mj_gnos);
            this->current_mj_gnos = incoming_gnos;

            mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);

            // Ship the part ids along the same plan.
            message_tag++;
            ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
                sizeof(mj_part_t), (char *) incoming_partIds);
            Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
            freeArray<mj_part_t>(this->assigned_part_ids);
            this->assigned_part_ids = incoming_partIds;

            this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
            ierr = Zoltan_Comm_Destroy(&plan);
            Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);

            // This process now owns exactly the coordinates it started with.
            this->num_local_coords = incoming;
            //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
        }
        else

#endif // ENABLE_ZOLTAN_MIGRATION
        {
            //if data is migrated, then send part numbers to the original owners.
            this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating");
            Tpetra::Distributor distributor(this->mj_problemComm);
            ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
            mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
            this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating" );

            this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
            //migrate gnos to actual owners.
            ArrayRCP<mj_gno_t> received_gnos(incoming);
            ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
            distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
            freeArray<mj_gno_t>(this->current_mj_gnos);
            this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
            memcpy( this->current_mj_gnos,
                received_gnos.getRawPtr(),
                incoming * sizeof(mj_gno_t));

            //migrate part ids to actual owners.
            ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
            ArrayRCP<mj_part_t> received_partids(incoming);
            distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
            freeArray<mj_part_t>(this->assigned_part_ids);
            this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
            memcpy( this->assigned_part_ids,
                received_partids.getRawPtr(),
                incoming * sizeof(mj_part_t));

            this->num_local_coords = incoming;
            this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");

        }
    }

    this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");

    this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");

    //ArrayRCP<mj_part_t> partId;
    //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);

    // Reduce the per-process part boxes to their global boundaries.
    if (this->mj_keep_part_boxes){
        this->kept_boxes = compute_global_box_boundaries(output_part_boxes);

    }

    this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
}
5823 
5826 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5827  typename mj_part_t>
5828 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
5829  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Free");
5830 
5831  for (int i=0; i < this->coord_dim; i++){
5832  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5833  }
5834  freeArray<mj_scalar_t *>(this->mj_coordinates);
5835 
5836  for (int i=0; i < this->num_weights_per_coord; i++){
5837  freeArray<mj_scalar_t>(this->mj_weights[i]);
5838  }
5839  freeArray<mj_scalar_t *>(this->mj_weights);
5840 
5841  freeArray<int>(this->owner_of_coordinate);
5842 
5843  for(int i = 0; i < this->num_threads; ++i){
5844  freeArray<mj_lno_t>(this->thread_point_counts[i]);
5845  }
5846 
5847  freeArray<mj_lno_t *>(this->thread_point_counts);
5848  freeArray<double *> (this->thread_part_weight_work);
5849 
5850  if(this->distribute_points_on_cut_lines){
5851  freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5852  for(int i = 0; i < this->num_threads; ++i){
5853  freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5854  }
5855  freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5856  freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5857  freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5858  }
5859 
5860  freeArray<mj_part_t>(this->my_incomplete_cut_count);
5861 
5862  freeArray<mj_scalar_t>(this->max_min_coords);
5863 
5864  freeArray<mj_lno_t>(this->part_xadj);
5865 
5866  freeArray<mj_lno_t>(this->coordinate_permutations);
5867 
5868  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5869 
5870  freeArray<mj_scalar_t>(this->all_cut_coordinates);
5871 
5872  freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5873 
5874  freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5875 
5876  freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5877 
5878  freeArray<mj_scalar_t>(this->target_part_weights);
5879 
5880  freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5881 
5882  freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5883 
5884  freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5885  freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5886  freeArray<bool>(this->is_cut_line_determined);
5887  freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5888  freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5889 
5890  for(int i = 0; i < this->num_threads; ++i){
5891  freeArray<double>(this->thread_part_weights[i]);
5892  freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5893  freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5894  }
5895 
5896  freeArray<double *>(this->thread_part_weights);
5897  freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5898  freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5899 
5900  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Free");
5901 }
5902 
5911 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5912  typename mj_part_t>
5914  bool distribute_points_on_cut_lines_,
5915  int max_concurrent_part_calculation_,
5916  int check_migrate_avoid_migration_option_,
5917  mj_scalar_t minimum_migration_imbalance_,
5918  int migration_type_ ){
5919  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5920  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5921  this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5922  this->minimum_migration_imbalance = minimum_migration_imbalance_;
5923  this->migration_type = migration_type_;
5924 
5925 }
5926 
5927 
5928 
5929 
5958 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5959  typename mj_part_t>
5961 
5962  const RCP<const Environment> &env,
5963  RCP<const Comm<int> > &problemComm,
5964 
5965  double imbalance_tolerance_,
5966  size_t num_global_parts_,
5967  mj_part_t *part_no_array_,
5968  int recursion_depth_,
5969 
5970  int coord_dim_,
5971  mj_lno_t num_local_coords_,
5972  mj_gno_t num_global_coords_,
5973  const mj_gno_t *initial_mj_gnos_,
5974  mj_scalar_t **mj_coordinates_,
5975 
5976  int num_weights_per_coord_,
5977  bool *mj_uniform_weights_,
5978  mj_scalar_t **mj_weights_,
5979  bool *mj_uniform_parts_,
5980  mj_scalar_t **mj_part_sizes_,
5981 
5982  mj_part_t *&result_assigned_part_ids_,
5983  mj_gno_t *&result_mj_gnos_
5984 )
5985 {
5986 
5987 
5988 
5989 #ifdef print_debug
5990  if(comm->getRank() == 0){
5991  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
5992  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
5993  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
5994  }
5995 #endif
5996  this->mj_env = env;
5997  this->mj_problemComm = problemComm;
5998  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
5999 
6000  /*
6001  if (0)
6002  {
6003  int a = rand();
6004  this->mj_problemComm->broadcast(0, sizeof(int), (char *) (&a));
6005  std::string istring = "output_" + Teuchos::toString<int>(a) + "_" + Teuchos::toString<int>(myRank) + ".mtx";
6006 
6007  std::ofstream output(istring.c_str());
6008  output << num_local_coords_ << " " << coord_dim_ << std::endl;
6009  for (int j = 0; j < coord_dim_ ; ++j){
6010  for (size_t i = 0; i < num_local_coords_; ++i){
6011  output << mj_coordinates_[j][i] << std::endl;
6012  }
6013 
6014  }
6015  output.close();
6016  }
6017  */
6018 
6019 
6020  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Total");
6021  this->mj_env->debug(3, "In MultiJagged Jagged");
6022 
6023  {
6024  this->imbalance_tolerance = imbalance_tolerance_;
6025  this->num_global_parts = num_global_parts_;
6026  this->part_no_array = part_no_array_;
6027  this->recursion_depth = recursion_depth_;
6028 
6029  this->coord_dim = coord_dim_;
6030  this->num_local_coords = num_local_coords_;
6031  this->num_global_coords = num_global_coords_;
6032  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
6033  this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].
6034 
6035  this->num_weights_per_coord = num_weights_per_coord_;
6036  this->mj_uniform_weights = mj_uniform_weights_;
6037  this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
6038  this->mj_uniform_parts = mj_uniform_parts_;
6039  this->mj_part_sizes = mj_part_sizes_;
6040 
6041  this->num_threads = 1;
6042 #ifdef HAVE_ZOLTAN2_OMP
6043 #pragma omp parallel
6044 
6045  {
6046  this->num_threads = omp_get_num_threads();
6047  }
6048 #endif
6049  }
6050  //this->set_input_data();
6051  this->set_part_specifications();
6052 
6053  this->allocate_set_work_memory();
6054 
6055  //We duplicate the comm as we create subcommunicators during migration.
6056  //We keep the problemComm as it is, while comm changes after each migration.
6057  this->comm = this->mj_problemComm->duplicate();
6058 
6059  //initially there is a single partition
6060  mj_part_t current_num_parts = 1;
6061  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
6062 
6063  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6064 
6065  mj_part_t output_part_begin_index = 0;
6066  mj_part_t future_num_parts = this->total_num_part;
6067  bool is_data_ever_migrated = false;
6068 
6069  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
6070  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
6071  next_future_num_parts_in_parts->push_back(this->num_global_parts);
6072 
6073  RCP<mj_partBoxVector_t> input_part_boxes(new mj_partBoxVector_t(), true) ;
6074  RCP<mj_partBoxVector_t> output_part_boxes(new mj_partBoxVector_t(), true);
6075 
6076  compute_global_box();
6077  if(this->mj_keep_part_boxes){
6078  this->init_part_boxes(output_part_boxes);
6079  }
6080 
6081  for (int i = 0; i < this->recursion_depth; ++i){
6082  //partitioning array. size will be as the number of current partitions and this
6083  //holds how many parts that each part will be in the current dimension partitioning.
6084  std::vector <mj_part_t> num_partitioning_in_current_dim;
6085 
6086  //number of parts that will be obtained at the end of this partitioning.
6087  //future_num_part_in_parts is as the size of current number of parts.
6088  //holds how many more parts each should be divided in the further
6089  //iterations. this will be used to calculate num_partitioning_in_current_dim,
6090  //as the number of parts that the part will be partitioned
6091  //in the current dimension partitioning.
6092 
6093  //next_future_num_parts_in_parts will be as the size of outnumParts,
6094  //and this will hold how many more parts that each output part
6095  //should be divided. this array will also be used to determine the weight ratios
6096  //of the parts.
6097  //swap the arrays to use iteratively..
6098  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
6099  future_num_part_in_parts = next_future_num_parts_in_parts;
6100  next_future_num_parts_in_parts = tmpPartVect;
6101 
6102  //clear next_future_num_parts_in_parts array as
6103  //getPartitionArrays expects it to be empty.
6104  //it also expects num_partitioning_in_current_dim to be empty as well.
6105  next_future_num_parts_in_parts->clear();
6106 
6107  if(this->mj_keep_part_boxes){
6108  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6109  input_part_boxes = output_part_boxes;
6110  output_part_boxes = tmpPartBoxes;
6111  output_part_boxes->clear();
6112  }
6113 
6114  //returns the total no. of output parts for this dimension partitioning.
6115  mj_part_t output_part_count_in_dimension =
6116  this->update_part_num_arrays(
6117  num_partitioning_in_current_dim,
6118  future_num_part_in_parts,
6119  next_future_num_parts_in_parts,
6120  future_num_parts,
6121  current_num_parts,
6122  i,
6123  input_part_boxes,
6124  output_part_boxes, 1);
6125 
6126  //if the number of obtained parts equal to current number of parts,
6127  //skip this dimension. For example, this happens when 1 is given in the input
6128  //part array is given. P=4,5,1,2
6129  if(output_part_count_in_dimension == current_num_parts) {
6130  //still need to swap the input output arrays.
6131  tmpPartVect= future_num_part_in_parts;
6132  future_num_part_in_parts = next_future_num_parts_in_parts;
6133  next_future_num_parts_in_parts = tmpPartVect;
6134 
6135  if(this->mj_keep_part_boxes){
6136  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6137  input_part_boxes = output_part_boxes;
6138  output_part_boxes = tmpPartBoxes;
6139  }
6140  continue;
6141  }
6142 
6143 
6144  //get the coordinate axis along which the partitioning will be done.
6145  int coordInd = i % this->coord_dim;
6146  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
6147 
6148  //convert i to string to be used for debugging purposes.
6149  std::string istring = Teuchos::toString<int>(i);
6150  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6151 
6152  //alloc Memory to point the indices
6153  //of the parts in the permutation array.
6154  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
6155 
6156  //the index where in the new_part_xadj will be written.
6157  mj_part_t output_part_index = 0;
6158  //whatever is written to output_part_index will be added with putput_coordinate_end_index
6159  //so that the points will be shifted.
6160  mj_part_t output_coordinate_end_index = 0;
6161 
6162  mj_part_t current_work_part = 0;
6163  mj_part_t current_concurrent_num_parts =
6164  std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
6165 
6166  mj_part_t obtained_part_index = 0;
6167 
6168  //run for all available parts.
6169  for (; current_work_part < current_num_parts;
6170  current_work_part += current_concurrent_num_parts){
6171 
6172  current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
6173  this->max_concurrent_part_calculation);
6174 
6175  mj_part_t actual_work_part_count = 0;
6176  //initialization for 1D partitioning.
6177  //get the min and max coordinates of each part
6178  //together with the part weights of each part.
6179  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6180  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
6181 
6182  //if this part wont be partitioned any further
6183  //dont do any work for this part.
6184  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
6185  continue;
6186  }
6187  ++actual_work_part_count;
6188  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
6189  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
6190 
6191 /*
6192  std::cout << "i:" << i << " j:" << current_work_part + kk
6193  << " coordinate_begin_index:" << coordinate_begin_index
6194  << " coordinate_end_index:" << coordinate_end_index
6195  << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
6196  */
6197  this->mj_get_local_min_max_coord_totW(
6198  coordinate_begin_index,
6199  coordinate_end_index,
6200  this->coordinate_permutations,
6201  mj_current_dim_coords,
6202  this->process_local_min_max_coord_total_weight[kk], //min_coordinate
6203  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
6204  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight
6205 
6206  }
6207 
6208  //1D partitioning
6209  if (actual_work_part_count > 0){
6210  //obtain global Min max of the part.
6211  this->mj_get_global_min_max_coord_totW(
6212  current_concurrent_num_parts,
6213  this->process_local_min_max_coord_total_weight,
6214  this->global_min_max_coord_total_weight);
6215 
6216  //represents the total number of cutlines
6217  //whose coordinate should be determined.
6218  mj_part_t total_incomplete_cut_count = 0;
6219 
6220  //Compute weight ratios for parts & cuts:
6221  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
6222  //part0 cut0 part1 cut1 part2 cut2 part3
6223  mj_part_t concurrent_part_cut_shift = 0;
6224  mj_part_t concurrent_part_part_shift = 0;
6225  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6226  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
6227  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6228  current_concurrent_num_parts];
6229 
6230  mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6231  2 * current_concurrent_num_parts];
6232 
6233  mj_part_t concurrent_current_part_index = current_work_part + kk;
6234 
6235  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6236 
6237  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6238  mj_scalar_t *current_target_part_weights = this->target_part_weights +
6239  concurrent_part_part_shift;
6240  //shift the usedCutCoordinate array as noCuts.
6241  concurrent_part_cut_shift += partition_count - 1;
6242  //shift the partRatio array as noParts.
6243  concurrent_part_part_shift += partition_count;
6244 
6245 
6246  //calculate only if part is not empty,
6247  //and part will be further partitioned.
6248  if(partition_count > 1 && min_coordinate <= max_coordinate){
6249 
6250  //increase num_cuts_do_be_determined by the number of cuts of the current
6251  //part's cut line number.
6252  total_incomplete_cut_count += partition_count - 1;
6253  //set the number of cut lines that should be determined
6254  //for this part.
6255  this->my_incomplete_cut_count[kk] = partition_count - 1;
6256 
6257  //get the target weights of the parts.
6258  this->mj_get_initial_cut_coords_target_weights(
6259  min_coordinate,
6260  max_coordinate,
6261  partition_count - 1,
6262  global_total_weight,
6263  usedCutCoordinate,
6264  current_target_part_weights,
6265  future_num_part_in_parts,
6266  next_future_num_parts_in_parts,
6267  concurrent_current_part_index,
6268  obtained_part_index);
6269 
6270  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6271  mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
6272 
6273  //get the initial estimated part assignments of the
6274  //coordinates.
6275  this->set_initial_coordinate_parts(
6276  max_coordinate,
6277  min_coordinate,
6278  concurrent_current_part_index,
6279  coordinate_begin_index, coordinate_end_index,
6280  this->coordinate_permutations,
6281  mj_current_dim_coords,
6282  this->assigned_part_ids,
6283  partition_count);
6284  }
6285  else {
6286  // e.g., if have fewer coordinates than parts, don't need to do next dim.
6287  this->my_incomplete_cut_count[kk] = 0;
6288  }
6289  obtained_part_index += partition_count;
6290  }
6291 
6292 
6293 
6294  //used imbalance, it is always 0, as it is difficult to
6295  //estimate a range.
6296  mj_scalar_t used_imbalance = 0;
6297 
6298 
6299  // Determine cut lines for all concurrent parts parts here.
6300  this->mj_1D_part(
6301  mj_current_dim_coords,
6302  used_imbalance,
6303  current_work_part,
6304  current_concurrent_num_parts,
6305  current_cut_coordinates,
6306  total_incomplete_cut_count,
6307  num_partitioning_in_current_dim);
6308  }
6309 
6310  //create new part chunks
6311  {
6312  mj_part_t output_array_shift = 0;
6313  mj_part_t cut_shift = 0;
6314  size_t tlr_shift = 0;
6315  size_t partweight_array_shift = 0;
6316 
6317  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6318  mj_part_t current_concurrent_work_part = current_work_part + kk;
6319  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6320 
6321  //if the part is empty, skip the part.
6322  if((num_parts != 1 )
6323  &&
6324  this->global_min_max_coord_total_weight[kk] >
6325  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6326 
6327  //we still need to write the begin and end point of the
6328  //empty part. simply set it zero, the array indices will be shifted later.
6329  for(mj_part_t jj = 0; jj < num_parts; ++jj){
6330  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6331  }
6332  cut_shift += num_parts - 1;
6333  tlr_shift += (4 *(num_parts - 1) + 1);
6334  output_array_shift += num_parts;
6335  partweight_array_shift += (2 * (num_parts - 1) + 1);
6336  continue;
6337  }
6338 
6339  mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6340  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6341  current_concurrent_work_part -1];
6342  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6343  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6344  cut_shift;
6345 
6346  //mj_scalar_t *used_tlr_array = this->total_part_weight_left_right_closests + tlr_shift;
6347 
6348  for(int ii = 0; ii < this->num_threads; ++ii){
6349  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6350  }
6351 
6352  if(num_parts > 1){
6353  if(this->mj_keep_part_boxes){
6354  //if part boxes are to be stored update the boundaries.
6355  for (mj_part_t j = 0; j < num_parts - 1; ++j){
6356  (*output_part_boxes)[output_array_shift + output_part_index +
6357  j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6358  /*update max*/, coordInd);
6359 
6360  (*output_part_boxes)[output_array_shift + output_part_index + j +
6361  1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6362  /*update min*/, coordInd);
6363  }
6364  }
6365 
6366  // Rewrite the indices based on the computed cuts.
6367  this->mj_create_new_partitions(
6368  num_parts,
6369  mj_current_dim_coords,
6370  current_concurrent_cut_coordinate,
6371  coordinate_begin,
6372  coordinate_end,
6373  used_local_cut_line_weight_to_left,
6374  this->thread_part_weight_work,
6375  this->new_part_xadj + output_part_index + output_array_shift
6376  );
6377 
6378  }
6379  else {
6380  //if this part is partitioned into 1 then just copy
6381  //the old values.
6382  mj_lno_t part_size = coordinate_end - coordinate_begin;
6383  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6384  memcpy(
6385  this->new_coordinate_permutations + coordinate_begin,
6386  this->coordinate_permutations + coordinate_begin,
6387  part_size * sizeof(mj_lno_t));
6388  }
6389  cut_shift += num_parts - 1;
6390  tlr_shift += (4 *(num_parts - 1) + 1);
6391  output_array_shift += num_parts;
6392  partweight_array_shift += (2 * (num_parts - 1) + 1);
6393  }
6394 
6395  //shift cut coordinates so that all cut coordinates are stored.
6396  //no shift now because we dont keep the cuts.
6397  //current_cut_coordinates += cut_shift;
6398 
6399  //mj_create_new_partitions from coordinates partitioned the parts and
6400  //write the indices as if there were a single part.
6401  //now we need to shift the beginning indices.
6402  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6403  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6404  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6405  //shift it by previousCount
6406  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6407  }
6408  //increase the previous count by current end.
6409  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6410  //increase the current out.
6411  output_part_index += num_parts ;
6412  }
6413  }
6414  }
6415  // end of this partitioning dimension
6416 
6417 
6418  int current_world_size = this->comm->getSize();
6419  long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6420 
6421 
6422  bool is_migrated_in_current_dimension = false;
6423 
6424  //we migrate if there are more partitionings to be done after this step
6425  //and if the migration is not forced to be avoided.
6426  //and the operation is not sequential.
6427  if (future_num_parts > 1 &&
6428  this->check_migrate_avoid_migration_option >= 0 &&
6429  current_world_size > 1){
6430 
6431  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6432  mj_part_t num_parts = output_part_count_in_dimension;
6433  if ( this->mj_perform_migration(
6434  num_parts,
6435  current_num_parts, //output
6436  next_future_num_parts_in_parts, //output
6437  output_part_begin_index,
6438  migration_reduce_all_population,
6439  this->num_local_coords / (future_num_parts * current_num_parts),
6440  istring,
6441  input_part_boxes, output_part_boxes) ) {
6442  is_migrated_in_current_dimension = true;
6443  is_data_ever_migrated = true;
6444  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" +
6445  istring);
6446  //since data is migrated, we reduce the number of reduceAll operations for the last part.
6447  this->total_dim_num_reduce_all /= num_parts;
6448  }
6449  else {
6450  is_migrated_in_current_dimension = false;
6451  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6452  }
6453  }
6454 
6455  //swap the coordinate permutations for the next dimension.
6456  mj_lno_t * tmp = this->coordinate_permutations;
6457  this->coordinate_permutations = this->new_coordinate_permutations;
6458  this->new_coordinate_permutations = tmp;
6459 
6460  if(!is_migrated_in_current_dimension){
6461  this->total_dim_num_reduce_all -= current_num_parts;
6462  current_num_parts = output_part_count_in_dimension;
6463  }
6464  freeArray<mj_lno_t>(this->part_xadj);
6465  this->part_xadj = this->new_part_xadj;
6466  this->new_part_xadj = NULL;
6467  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6468  }
6469 
6470  // Partitioning is done
6471  delete future_num_part_in_parts;
6472  delete next_future_num_parts_in_parts;
6473 
6474  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6476 
6477 
6478  //get the final parts of each initial coordinate
6479  //the results will be written to
6480  //this->assigned_part_ids for gnos given in this->current_mj_gnos
6481  this->set_final_parts(
6482  current_num_parts,
6483  output_part_begin_index,
6484  output_part_boxes,
6485  is_data_ever_migrated);
6486 
6487  result_assigned_part_ids_ = this->assigned_part_ids;
6488  result_mj_gnos_ = this->current_mj_gnos;
6489 
6490  this->free_work_memory();
6491  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Total");
6492  this->mj_env->debug(3, "Out of MultiJagged");
6493 
6494 }
6495 
6496 
/*! \brief Multi Jagged coordinate partitioning algorithm wrapper.
 *
 *  Adapts Zoltan2 input (coordinate model, environment, parameter list)
 *  to the core MJ partitioner, optionally premigrating the coordinates
 *  to a subset of the ranks first (see mj_premigrate_to_subset).
 *
 *  NOTE(review): this extraction is missing a few original source lines
 *  (e.g. the typedef introducing mj_partBox_t, the mj_partitioner member,
 *  the destructor signature, and several validator arguments inside
 *  getValidParameters) -- verify against the original source file.
 */
template <typename Adapter>
class Zoltan2_AlgMJ : public Algorithm<Adapter>
{
private:

#ifndef DOXYGEN_SHOULD_SKIP_THIS

  // Convenience typedefs pulled from the input adapter's traits.
  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
  typedef typename Adapter::scalar_t mj_scalar_t;
  typedef typename Adapter::gno_t mj_gno_t;
  typedef typename Adapter::lno_t mj_lno_t;
  typedef typename Adapter::node_t mj_node_t;
  typedef typename Adapter::part_t mj_part_t;
  // NOTE(review): the typedef that introduces mj_partBox_t is missing from
  // this extraction (original line 6513).
  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
#endif

  RCP<const Environment> mj_env; //the environment object
  RCP<const Comm<int> > mj_problemComm; //initial comm object
  RCP<const coordinateModel_t> mj_coords; //coordinate adapter

  //PARAMETERS
  double imbalance_tolerance; //input imbalance tolerance.
  size_t num_global_parts; //the targeted number of parts
  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
  int recursion_depth; //the number of steps that partitioning will be solved in.

  int coord_dim; // coordinate dimension.
  mj_lno_t num_local_coords; //number of local coords.
  mj_gno_t num_global_coords; //number of global coords.
  const mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
  mj_scalar_t **mj_coordinates; //two dimension coordinate array

  int num_weights_per_coord; // number of weights per coordinate
  bool *mj_uniform_weights; //if the coordinates have uniform weights.
  mj_scalar_t **mj_weights; //two dimensional weight array
  bool *mj_uniform_parts; //if the target parts are uniform
  mj_scalar_t **mj_part_sizes; //target part weight sizes.

  bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordinate to different parts.
  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
  int migration_type; // when doing the migration, 0 will aim for perfect load-imbalance,
  //1 for minimized messages
  mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
  bool mj_keep_part_boxes; //if the boxes need to be kept.

  int num_threads; // thread count used by the core partitioner.

  bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
  int mj_premigration_option; // 0: no premigration; >0: gather coords to a rank subset first.
  int min_coord_per_rank_for_premigration; // avg-coords-per-rank threshold that enables premigration.

  ArrayRCP<mj_part_t> comXAdj_; //communication graph xadj
  ArrayRCP<mj_part_t> comAdj_; //communication graph adj.

  //when we have strided data, it returns a unstrided data in RCP form.
  //we need to hold on to that data, during the execution of mj, so that the data is not released.
  //coordinate_rcp_holder will hold that data, and release it when MJ is deleted.
  ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;

  // Extracts coordinates/weights/part info from the model and the solution.
  void set_up_partitioning_data(
    const RCP<PartitioningSolution<Adapter> >&solution);

  // Reads the MJ-specific entries out of the parameter list.
  void set_input_parameters(const Teuchos::ParameterList &p);

  // Releases the work arrays allocated by set_up_partitioning_data.
  void free_work_memory();

  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;

  // Gathers coordinates/weights/gnos onto a consecutive subset of
  // used_num_ranks ranks before MJ runs; returns true iff this rank is a
  // receiver (i.e. participates in the subsequent partitioning).
  bool mj_premigrate_to_subset(int used_num_ranks, int migration_selection_option,
    RCP<const Environment> mj_env_,
    RCP<const Comm<int> > mj_problemComm_,
    int coord_dim_,
    mj_lno_t num_local_coords_,
    mj_gno_t num_global_coords_, size_t num_global_parts_,
    const mj_gno_t *initial_mj_gnos_,
    mj_scalar_t **mj_coordinates_,
    int num_weights_per_coord_,
    mj_scalar_t **mj_weights_,
    //results
    RCP<const Comm<int> > &result_problemComm_,
    mj_lno_t & result_num_local_coords_,
    mj_gno_t * &result_initial_mj_gnos_,
    mj_scalar_t ** &result_mj_coordinates_,
    mj_scalar_t ** &result_mj_weights_,
    int * &result_actual_owner_rank_);

public:

  // Constructor: stores the environment/communicator/model handles and
  // initializes every parameter to its documented default.
  // NOTE(review): mj_partitioner is initialized below but its declaration
  // is not visible in this extraction.
  Zoltan2_AlgMJ(const RCP<const Environment> &env,
    RCP<const Comm<int> > &problemComm,
    const RCP<const coordinateModel_t> &coords) :
      mj_partitioner(), mj_env(env),
      mj_problemComm(problemComm),
      mj_coords(coords),
      imbalance_tolerance(0),
      num_global_parts(1), part_no_array(NULL),
      recursion_depth(0),
      coord_dim(0),num_local_coords(0), num_global_coords(0),
      initial_mj_gnos(NULL), mj_coordinates(NULL),
      num_weights_per_coord(0),
      mj_uniform_weights(NULL), mj_weights(NULL),
      mj_uniform_parts(NULL),
      mj_part_sizes(NULL),
      distribute_points_on_cut_lines(true),
      max_concurrent_part_calculation(1),
      check_migrate_avoid_migration_option(0), migration_type(0),
      minimum_migration_imbalance(0.30),
      mj_keep_part_boxes(false), num_threads(1), mj_run_as_rcb(false),mj_premigration_option(0), min_coord_per_rank_for_premigration(32000),
      comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
  {}

  // NOTE(review): the destructor signature (~Zoltan2_AlgMJ()) is missing
  // from this extraction (original line 6614); the body below releases the
  // strided-data ArrayRCP holder allocated in set_up_partitioning_data.
  if (coordinate_ArrayRCP_holder != NULL){
    delete [] this->coordinate_ArrayRCP_holder;
    this->coordinate_ArrayRCP_holder = NULL;
  }
  }

  /*! \brief Registers MJ's parameters and their validators in \c pl. */
  static void getValidParameters(ParameterList & pl)
  {
    const bool bUnsorted = true; // this clarifies the flag is for unsorted
    RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
      Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
    pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
      "algorithm. As many as the dimension count.", mj_parts_Validator);

    pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
      "coordinates will be calculated concurently.", Environment::getAnyIntValidator());

    pl.set("mj_minimum_migration_imbalance", 1.1,
      "mj_minimum_migration_imbalance, the minimum imbalance of the "
      "processors to avoid migration",
      // NOTE(review): the validator argument and closing ");" for this
      // pl.set(...) call are missing from this extraction (original 6637).

    RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
      Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
    pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
      "depending on the imbalance, 1 for forcing migration, 2 for "
      "avoiding migration", mj_migration_option_validator);

    RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
      Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
    // NOTE(review): mj_migration_option_validator is passed below, but
    // mj_migration_type_validator (declared just above and otherwise
    // unused) appears intended -- likely a copy/paste bug; the option
    // validator allows 0-2 while migration_type is documented as 0-1.
    pl.set("mj_migration_type", 0, "Migration type, 0 for migration to minimize the imbalance "
      "1 for migration to minimize messages exchanged the migration." ,
      mj_migration_option_validator);

    // bool parameter
    pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
      "geometric partitioning.", Environment::getBoolValidator());

    // bool parameter
    pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
      // NOTE(review): the validator argument and closing ");" are missing
      // from this extraction (original 6660).

    pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
      "greater than 0.", Environment::getAnyIntValidator());

    RCP<Teuchos::EnhancedNumberValidator<int>> mj_premigration_option_validator =
      Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );

    pl.set("mj_premigration_option", 0, "Whether to do premigration or not. 0 for no migration "
      "x > 0 for migration to consecutive processors, the subset will be 0,x,2x,3x,...subset ranks."
      , mj_premigration_option_validator);

    pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to assign each rank in multijagged after premigration"
      // NOTE(review): the validator argument and closing ");" are missing
      // from this extraction (original 6673).

  }

  /*! \brief Runs the multi jagged partitioning; fills \c solution with the
   *  part assignment of every input coordinate.
   */
  void partition(const RCP<PartitioningSolution<Adapter> > &solution);

  // Returns a reference to the globally-gathered part boxes
  // (only meaningful when mj_keep_part_boxes is set).
  mj_partBoxVector_t &getPartBoxesView() const
  {
    RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
    return *pBoxes;
  }

  // Presumably maps a dim-dimensional point to the part whose box contains
  // it -- implementation not visible here; confirm against the definition.
  mj_part_t pointAssign(int dim, mj_scalar_t *point) const;

  // Presumably finds all parts whose boxes intersect [lower, upper] --
  // implementation not visible here; confirm against the definition.
  void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper,
    size_t &nPartsFound, mj_part_t **partsFound) const;

  /*! \brief Fills the part-to-part communication graph in CSR form. */
  void getCommunicationGraph(
    const PartitioningSolution<Adapter> *solution,
    ArrayRCP<mj_part_t> &comXAdj,
    ArrayRCP<mj_part_t> &comAdj);
};
6704 
6705 
6706 
6707 
6708 template <typename Adapter>
6709 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset( int used_num_ranks,
6710  int migration_selection_option,
6711  RCP<const Environment> mj_env_,
6712  RCP<const Comm<int> > mj_problemComm_,
6713  int coord_dim_,
6714  mj_lno_t num_local_coords_,
6715  mj_gno_t num_global_coords_, size_t num_global_parts_,
6716  const mj_gno_t *initial_mj_gnos_,
6717  mj_scalar_t **mj_coordinates_,
6718  int num_weights_per_coord_,
6719  mj_scalar_t **mj_weights_,
6720  //results
6721  RCP<const Comm<int> > &result_problemComm_,
6722  mj_lno_t &result_num_local_coords_,
6723  mj_gno_t * &result_initial_mj_gnos_,
6724  mj_scalar_t ** &result_mj_coordinates_,
6725  mj_scalar_t ** &result_mj_weights_,
6726  int * &result_actual_owner_rank_){
6727  mj_env_->timerStart(MACRO_TIMERS, "MultiJagged - PreMigration DistributorPlanCreating");
6728 
6729 
6730  int myRank = mj_problemComm_->getRank();
6731  int worldSize = mj_problemComm_->getSize();
6732 
6733  mj_part_t groupsize = worldSize / used_num_ranks;
6734 
6735  //std::cout << "used_num_ranks:" << used_num_ranks << " groupsize:" << groupsize << std::endl;
6736 
6737  std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
6738 
6739  mj_part_t i_am_sending_to = 0;
6740  bool am_i_a_reciever = false;
6741 
6742  for(int i = 0; i < used_num_ranks; ++i){
6743  group_begins[i+ 1] = group_begins[i] + groupsize;
6744  if (worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
6745  if (i == used_num_ranks) group_begins[i+ 1] = worldSize;
6746  if (myRank >= group_begins[i] && myRank < group_begins[i + 1]) i_am_sending_to = group_begins[i];
6747  if (myRank == group_begins[i]) am_i_a_reciever= true;
6748  }
6749 
6750  ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
6751  result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
6752 
6753 
6754  Tpetra::Distributor distributor(mj_problemComm_);
6755 
6756  std::vector<mj_part_t> coordinate_destinations(num_local_coords_, i_am_sending_to);
6757  ArrayView<const mj_part_t> destinations( &(coordinate_destinations[0]), num_local_coords_);
6758  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6759  result_num_local_coords_ = num_incoming_gnos;
6760  mj_env_->timerStop(MACRO_TIMERS, "MultiJagged - PreMigration DistributorPlanCreating");
6761 
6762  mj_env_->timerStart(MACRO_TIMERS, "MultiJagged - PreMigration DistributorMigration");
6763 
6764  //migrate gnos.
6765  {
6766  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6767 
6768  ArrayView<const mj_gno_t> sent_gnos(initial_mj_gnos_, num_local_coords_);
6769  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6770 
6771  result_initial_mj_gnos_ = allocMemory<mj_gno_t>(num_incoming_gnos);
6772  memcpy(
6773  result_initial_mj_gnos_,
6774  received_gnos.getRawPtr(),
6775  num_incoming_gnos * sizeof(mj_gno_t));
6776  }
6777 
6778  //migrate coordinates
6779  result_mj_coordinates_ = allocMemory<mj_scalar_t *>(coord_dim_);
6780  for (int i = 0; i < coord_dim_; ++i){
6781  ArrayView<const mj_scalar_t> sent_coord(mj_coordinates_[i], num_local_coords_);
6782  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
6783  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
6784  result_mj_coordinates_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
6785  memcpy(
6786  result_mj_coordinates_[i],
6787  received_coord.getRawPtr(),
6788  num_incoming_gnos * sizeof(mj_scalar_t));
6789  }
6790 
6791  result_mj_weights_ = allocMemory<mj_scalar_t *>(num_weights_per_coord_);
6792  //migrate weights.
6793  for (int i = 0; i < num_weights_per_coord_; ++i){
6794  ArrayView<const mj_scalar_t> sent_weight(mj_weights_[i], num_local_coords_);
6795  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6796  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
6797  result_mj_weights_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
6798  memcpy(
6799  result_mj_weights_[i],
6800  received_weight.getRawPtr(),
6801  num_incoming_gnos * sizeof(mj_scalar_t));
6802  }
6803 
6804  //migrate the owners of the coordinates
6805  {
6806  std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
6807  ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
6808  ArrayRCP<int> received_owners(num_incoming_gnos);
6809  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
6810  result_actual_owner_rank_ = allocMemory<int>(num_incoming_gnos);
6811  memcpy(
6812  result_actual_owner_rank_,
6813  received_owners.getRawPtr(),
6814  num_incoming_gnos * sizeof(int));
6815  }
6816  mj_env_->timerStop(MACRO_TIMERS, "MultiJagged - PreMigration DistributorMigration");
6817  return am_i_a_reciever;
6818 }
6819 
6820 
6821 
6822 
6823 
6824 
6825 
/*! \brief Multi Jagged coordinate partitioning entry point.
 *
 *  NOTE(review): the signature line of this definition (presumably
 *  "void Zoltan2_AlgMJ<Adapter>::partition(") is missing from this
 *  extraction (original line 6836) -- verify against the original source.
 *
 *  \param solution partitioning solution; on return solution->setParts()
 *         has been called with one part id per input coordinate, in the
 *         original input order.
 */
template <typename Adapter>
 const RCP<PartitioningSolution<Adapter> > &solution
)
{
  // Pull coordinates/weights from the model and MJ parameters from the env.
  this->set_up_partitioning_data(solution);
  this->set_input_parameters(this->mj_env->getParameters());
  if (this->mj_keep_part_boxes){
    this->mj_partitioner.set_to_keep_part_boxes();
  }
  this->mj_partitioner.set_partitioning_parameters(
    this->distribute_points_on_cut_lines,
    this->max_concurrent_part_calculation,
    this->check_migrate_avoid_migration_option,
    this->minimum_migration_imbalance, this->migration_type);

  // Defaults: run on the full communicator with the original data.
  // Premigration (below) may replace these with a gathered subset.
  RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
  mj_lno_t result_num_local_coords = this->num_local_coords;
  mj_gno_t * result_initial_mj_gnos = NULL;
  mj_scalar_t **result_mj_coordinates = this->mj_coordinates;
  mj_scalar_t **result_mj_weights = this->mj_weights;
  int *result_actual_owner_rank = NULL;
  const mj_gno_t * result_initial_mj_gnos_ = this->initial_mj_gnos;

  //TODO: MD 08/2017: Further discussion is required.
  //MueLu calls MJ when it has very few coordinates per processors, such as 10.
  //For example, it begins with 1K processor with 1K coordinate in each.
  //Then with coarsening this reduces to 10 coordinate per procesor.
  //It calls MJ to repartition these to 10 coordinates.
  //MJ runs with 1K processor, 10 coordinate in each, and partitions to 10 parts.
  //As expected strong scaling is problem here, because computation is almost 0, and
  //communication cost of MJ linearly increases.
  //Premigration option gathers the coordinates to 10 parts before MJ starts
  //therefore MJ will run with a smalller subset of the problem.
  //Below, I am migrating the coordinates if mj_premigration_option is set,
  //and the result parts are less than the current part count, and the average number of
  //local coordinates is less than some threshold.
  //For example, premigration may not help if 1000 processors are partitioning data to 10,
  //but each of them already have 1M coordinate. In that case, we premigration would not help.
  int current_world_size = this->mj_problemComm->getSize();
  mj_lno_t threshold_num_local_coords = this->min_coord_per_rank_for_premigration;
  bool is_pre_migrated = false;
  bool am_i_in_subset = true;
  if ( mj_premigration_option > 0 &&
    size_t (current_world_size) > this->num_global_parts &&
    this->num_global_coords < mj_gno_t (current_world_size * threshold_num_local_coords)){
    if (this->mj_keep_part_boxes){
      throw std::logic_error("Multijagged: mj_keep_part_boxes and mj_premigration_option are not supported together yet.");
    }
    is_pre_migrated =true;
    int migration_selection_option = mj_premigration_option;
    if(migration_selection_option * this->num_global_parts > (size_t) (current_world_size)){
      migration_selection_option = current_world_size / this->num_global_parts;
    }
    // Choose the receiver-rank count so each gets roughly the threshold
    // number of coordinates.
    int used_num_ranks = int (this->num_global_coords / float (threshold_num_local_coords) + 0.5);
    if (used_num_ranks == 0) used_num_ranks = 1;

    am_i_in_subset = this->mj_premigrate_to_subset(
      used_num_ranks,
      migration_selection_option,
      this->mj_env,
      this->mj_problemComm,
      this->coord_dim,
      this->num_local_coords,
      this->num_global_coords,
      this->num_global_parts,
      this->initial_mj_gnos,
      this->mj_coordinates,
      this->num_weights_per_coord,
      this->mj_weights,
      //results
      result_problemComm,
      result_num_local_coords,
      result_initial_mj_gnos,
      result_mj_coordinates,
      result_mj_weights,
      result_actual_owner_rank);
    result_initial_mj_gnos_ = result_initial_mj_gnos;
  }

  mj_part_t *result_assigned_part_ids = NULL;
  mj_gno_t *result_mj_gnos = NULL;

  // Only ranks that hold data after (possible) premigration run MJ.
  if (am_i_in_subset){
    this->mj_partitioner.multi_jagged_part(
      this->mj_env,
      result_problemComm, //this->mj_problemComm,

      this->imbalance_tolerance,
      this->num_global_parts,
      this->part_no_array,
      this->recursion_depth,

      this->coord_dim,
      result_num_local_coords, //this->num_local_coords,
      this->num_global_coords,
      result_initial_mj_gnos_, //this->initial_mj_gnos,
      result_mj_coordinates, //this->mj_coordinates,

      this->num_weights_per_coord,
      this->mj_uniform_weights,
      result_mj_weights, //this->mj_weights,
      this->mj_uniform_parts,
      this->mj_part_sizes,

      result_assigned_part_ids,
      result_mj_gnos
      );

  }

  // Reorder results so that they match the order of the input
  // (multi_jagged_part returns gnos/part ids in partition order; build a
  // gno -> local-index map to restore input order).

#if defined(__cplusplus) && __cplusplus >= 201103L
  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
  localGidToLid.reserve(result_num_local_coords);
  for (mj_lno_t i = 0; i < result_num_local_coords; i++)
    localGidToLid[result_initial_mj_gnos_[i]] = i;
  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
    0, result_num_local_coords, true);

  for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
    mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
    partId[origLID] = result_assigned_part_ids[i];
  }

#else
  // Pre-C++11 fallback: Teuchos hashtable instead of std::unordered_map.
  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
    localGidToLid(result_num_local_coords);
  for (mj_lno_t i = 0; i < result_num_local_coords; i++)
    localGidToLid.put(result_initial_mj_gnos_[i], i);

  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
    0, result_num_local_coords, true);

  for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
    mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
    partId[origLID] = result_assigned_part_ids[i];
  }

#endif // C++11 is enabled

  delete [] result_mj_gnos;
  delete [] result_assigned_part_ids;

  //now the results are reordered. but if premigration occured,
  //then we need to send these ids to actual owners again.
  if (is_pre_migrated){
    this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - PostMigration DistributorPlanCreating");
    Tpetra::Distributor distributor(this->mj_problemComm);

    // Route each (gno, part id) pair back to the rank that originally
    // owned the coordinate (recorded during premigration).
    ArrayView<const mj_part_t> actual_owner_destinations( result_actual_owner_rank , result_num_local_coords);
    mj_lno_t num_incoming_gnos = distributor.createFromSends(actual_owner_destinations);
    if (num_incoming_gnos != this->num_local_coords){
      throw std::logic_error("Zoltan2 - Multijagged Post Migration - num incoming is not equal to num local coords");
    }
    mj_env->timerStop(MACRO_TIMERS, "MultiJagged - PostMigration DistributorPlanCreating");
    mj_env->timerStart(MACRO_TIMERS, "MultiJagged - PostMigration DistributorMigration");
    ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
    ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
    {
      ArrayView<const mj_gno_t> sent_gnos(result_initial_mj_gnos_, result_num_local_coords);
      distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
    }
    {
      ArrayView<mj_part_t> sent_partnos(partId());
      distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1, received_partids());
    }
    // Rebuild partId in this rank's ORIGINAL local ordering.
    partId = arcp(new mj_part_t[this->num_local_coords],
      0, this->num_local_coords, true);

    {
#if defined(__cplusplus) && __cplusplus >= 201103L
      std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
      localGidToLid2.reserve(this->num_local_coords);
      for (mj_lno_t i = 0; i < this->num_local_coords; i++)
        localGidToLid2[this->initial_mj_gnos[i]] = i;


      for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
        mj_lno_t origLID = localGidToLid2[received_gnos[i]];
        partId[origLID] = received_partids[i];
      }

#else
      Teuchos::Hashtable<mj_gno_t, mj_lno_t>
        localGidToLid2(this->num_local_coords);
      for (mj_lno_t i = 0; i < this->num_local_coords; i++)
        localGidToLid2.put(this->initial_mj_gnos[i], i);


      for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
        mj_lno_t origLID = localGidToLid2.get(received_gnos[i]);
        partId[origLID] = received_partids[i];
      }

#endif // C++11 is enabled

    }

    {
      // Release the buffers allocated by mj_premigrate_to_subset.
      freeArray<mj_gno_t> (result_initial_mj_gnos);
      for (int i = 0; i < this->coord_dim; ++i){
        freeArray<mj_scalar_t> (result_mj_coordinates[i]);
      }
      freeArray<mj_scalar_t *> (result_mj_coordinates);

      for (int i = 0; i < this->num_weights_per_coord; ++i){
        freeArray<mj_scalar_t> (result_mj_weights[i]);
      }
      freeArray<mj_scalar_t *> (result_mj_weights);
      freeArray<int> (result_actual_owner_rank);
    }
    mj_env->timerStop(MACRO_TIMERS, "MultiJagged - PostMigration DistributorMigration");

  }

  solution->setParts(partId);
  this->free_work_memory();
}
7059 
/*! \brief Frees the work memory allocated for the multijagged algorithm.
 */
template <typename Adapter>
// NOTE(review): the signature line of this definition (presumably
// "void Zoltan2_AlgMJ<Adapter>::free_work_memory(){") is missing from
// this extraction (original line 7063).
  // Releases the 2-D pointer holders allocated in set_up_partitioning_data.
  // The underlying coordinate/weight data is not freed here -- those raw
  // pointers come from the ArrayRCPs kept in coordinate_ArrayRCP_holder,
  // which is released in the destructor.
  freeArray<mj_scalar_t *>(this->mj_coordinates);
  freeArray<mj_scalar_t *>(this->mj_weights);
  freeArray<bool>(this->mj_uniform_parts);
  freeArray<mj_scalar_t *>(this->mj_part_sizes);
  freeArray<bool>(this->mj_uniform_weights);

}
7071 
/*! \brief Sets up the partitioning data (coordinates, weights, part info)
 *  for the multijagged algorithm.
 */
7074 template <typename Adapter>
7075 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
7076  const RCP<PartitioningSolution<Adapter> > &solution
7077 )
7078 {
7079  this->coord_dim = this->mj_coords->getCoordinateDim();
7080  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
7081  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
7082  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
7083  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
7084 
7085  // From the Solution we get part information.
7086  // If the part sizes for a given criteria are not uniform,
7087  // then they are values that sum to 1.0.
7088  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
7089  //allocate only two dimensional pointer.
7090  //raw pointer addresess will be obtained from multivector.
7091  this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
7092  this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
7093 
7094  //if the partitioning results are to be uniform.
7095  this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
7096  //if in a criteria dimension, uniform part is false this shows ratios of
7097  //the target part weights.
7098  this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
7099  //if the weights of coordinates are uniform in a criteria dimension.
7100  this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
7101 
7102  typedef StridedData<mj_lno_t, mj_scalar_t> input_t;
7103  ArrayView<const mj_gno_t> gnos;
7104  ArrayView<input_t> xyz;
7105  ArrayView<input_t> wgts;
7106 
7107 
7108  this->coordinate_ArrayRCP_holder = new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
7109 
7110  this->mj_coords->getCoordinates(gnos, xyz, wgts);
7111  //obtain global ids.
7112  ArrayView<const mj_gno_t> mj_gnos = gnos;
7113  this->initial_mj_gnos = mj_gnos.getRawPtr();
7114 
7115  //extract coordinates from multivector.
7116  for (int dim=0; dim < this->coord_dim; dim++){
7117  ArrayRCP<const mj_scalar_t> ar;
7118  xyz[dim].getInputArray(ar);
7119  this->coordinate_ArrayRCP_holder[dim] = ar;
7120 
7121  //multiJagged coordinate values assignment
7122  this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
7123  }
7124 
7125  //if no weights are provided set uniform weight.
7126  if (this->num_weights_per_coord == 0){
7127  this->mj_uniform_weights[0] = true;
7128  this->mj_weights[0] = NULL;
7129  }
7130  else{
7131  //if weights are provided get weights for all weight indices
7132  for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
7133  ArrayRCP<const mj_scalar_t> ar;
7134  wgts[wdim].getInputArray(ar);
7135  this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
7136  this->mj_uniform_weights[wdim] = false;
7137  this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
7138  }
7139  }
7140 
7141  for (int wdim = 0; wdim < criteria_dim; wdim++){
7142  if (solution->criteriaHasUniformPartSizes(wdim)){
7143  this->mj_uniform_parts[wdim] = true;
7144  this->mj_part_sizes[wdim] = NULL;
7145  }
7146  else{
7147  std::cerr << "MJ does not support non uniform target part weights" << std::endl;
7148  exit(1);
7149  }
7150  }
7151 }
7152 
7153 /* \brief Sets the partitioning parameters for multijagged algorithm.
7154  * \param pl: is the parameter list provided to zoltan2 call
7155  * */
7156 template <typename Adapter>
7157 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){
7158 
7159  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
7160  if (pe){
7161  double tol;
7162  tol = pe->getValue(&tol);
7163  this->imbalance_tolerance = tol - 1.0;
7164  }
7165 
7166  // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
7167  if (this->imbalance_tolerance <= 0)
7168  this->imbalance_tolerance= 10e-4;
7169 
7170  //if an input partitioning array is provided.
7171  this->part_no_array = NULL;
7172  //the length of the input partitioning array.
7173  this->recursion_depth = 0;
7174 
7175  if (pl.getPtr<Array <mj_part_t> >("mj_parts")){
7176  this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >("mj_parts")->getRawPtr();
7177  this->recursion_depth = pl.getPtr<Array <mj_part_t> >("mj_parts")->size() - 1;
7178  this->mj_env->debug(2, "mj_parts provided by user");
7179  }
7180 
7181  //get mj specific parameters.
7182  this->distribute_points_on_cut_lines = true;
7183  this->max_concurrent_part_calculation = 1;
7184 
7185  this->mj_run_as_rcb = false;
7186  this->mj_premigration_option = 0;
7187  this->min_coord_per_rank_for_premigration = 32000;
7188 
7189  int mj_user_recursion_depth = -1;
7190  this->mj_keep_part_boxes = false;
7191  this->check_migrate_avoid_migration_option = 0;
7192  this->migration_type = 0;
7193  this->minimum_migration_imbalance = 0.35;
7194 
7195  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
7196  if (pe){
7197  double imb;
7198  imb = pe->getValue(&imb);
7199  this->minimum_migration_imbalance = imb - 1.0;
7200  }
7201 
7202  pe = pl.getEntryPtr("mj_migration_option");
7203  if (pe){
7204  this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
7205  }else {
7206  this->check_migrate_avoid_migration_option = 0;
7207  }
7208  if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
7209 
7211  pe = pl.getEntryPtr("mj_migration_type");
7212  if (pe){
7213  this->migration_type = pe->getValue(&this->migration_type);
7214  }else {
7215  this->migration_type = 0;
7216  }
7217  //std::cout << " this->migration_type:" << this->migration_type << std::endl;
7219 
7220  pe = pl.getEntryPtr("mj_concurrent_part_count");
7221  if (pe){
7222  this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
7223  }else {
7224  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
7225  }
7226 
7227  pe = pl.getEntryPtr("mj_keep_part_boxes");
7228  if (pe){
7229  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
7230  }else {
7231  this->mj_keep_part_boxes = false; // Set to invalid value
7232  }
7233 
7234 
7235  // For now, need keep_part_boxes to do pointAssign and boxAssign.
7236  // pe = pl.getEntryPtr("keep_cuts");
7237  // if (pe){
7238  // int tmp = pe->getValue(&tmp);
7239  // if (tmp) this->mj_keep_part_boxes = true;
7240  // }
7241 
7242  //need to keep part boxes if mapping type is geometric.
7243  if (this->mj_keep_part_boxes == false){
7244  pe = pl.getEntryPtr("mapping_type");
7245  if (pe){
7246  int mapping_type = -1;
7247  mapping_type = pe->getValue(&mapping_type);
7248  if (mapping_type == 0){
7249  mj_keep_part_boxes = true;
7250  }
7251  }
7252  }
7253 
7254  //need to keep part boxes if mapping type is geometric.
7255  pe = pl.getEntryPtr("mj_enable_rcb");
7256  if (pe){
7257  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
7258  }else {
7259  this->mj_run_as_rcb = false; // Set to invalid value
7260  }
7261 
7262  pe = pl.getEntryPtr("mj_premigration_option");
7263  if (pe){
7264  mj_premigration_option = pe->getValue(&mj_premigration_option);
7265  }else {
7266  mj_premigration_option = 0;
7267  }
7268 
7269  pe = pl.getEntryPtr("mj_premigration_coordinate_count");
7270  if (pe){
7271  min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
7272  }else {
7273  min_coord_per_rank_for_premigration = 32000;
7274  }
7275 
7276  pe = pl.getEntryPtr("mj_recursion_depth");
7277  if (pe){
7278  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
7279  }else {
7280  mj_user_recursion_depth = -1; // Set to invalid value
7281  }
7282 
7283  bool val = false;
7284  pe = pl.getEntryPtr("rectilinear");
7285  if (pe) val = pe->getValue(&val);
7286  if (val){
7287  this->distribute_points_on_cut_lines = false;
7288  } else {
7289  this->distribute_points_on_cut_lines = true;
7290  }
7291 
7292  if (this->mj_run_as_rcb){
7293  mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
7294  }
7295  if (this->recursion_depth < 1){
7296  if (mj_user_recursion_depth > 0){
7297  this->recursion_depth = mj_user_recursion_depth;
7298  }
7299  else {
7300  this->recursion_depth = this->coord_dim;
7301  }
7302  }
7303 
7304  this->num_threads = 1;
7305 #ifdef HAVE_ZOLTAN2_OMP
7306 #pragma omp parallel
7307  {
7308  this->num_threads = omp_get_num_threads();
7309  }
7310 #endif
7311 
7312 }
7313 
7315 template <typename Adapter>
7317  int dim,
7318  typename Adapter::scalar_t *lower,
7319  typename Adapter::scalar_t *upper,
7320  size_t &nPartsFound,
7321  typename Adapter::part_t **partsFound) const
7322 {
7323  // TODO: Implement with cuts rather than boxes to reduce algorithmic
7324  // TODO: complexity. Or at least do a search through the boxes, using
7325  // TODO: p x q x r x ... if possible.
7326 
7327  nPartsFound = 0;
7328  *partsFound = NULL;
7329 
7330  if (this->mj_keep_part_boxes) {
7331 
7332  // Get vector of part boxes
7333  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7334 
7335  size_t nBoxes = (*partBoxes).size();
7336  if (nBoxes == 0) {
7337  throw std::logic_error("no part boxes exist");
7338  }
7339 
7340  // Determine whether the box overlaps the globalBox at all
7341  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7342 
7343  if (globalBox->boxesOverlap(dim, lower, upper)) {
7344 
7345  std::vector<typename Adapter::part_t> partlist;
7346 
7347  // box overlaps the global box; find specific overlapping boxes
7348  for (size_t i = 0; i < nBoxes; i++) {
7349  try {
7350  if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
7351  nPartsFound++;
7352  partlist.push_back((*partBoxes)[i].getpId());
7353 
7354 // std::cout << "Given box (";
7355 // for (int j = 0; j < dim; j++)
7356 // std::cout << lower[j] << " ";
7357 // std::cout << ") x (";
7358 // for (int j = 0; j < dim; j++)
7359 // std::cout << upper[j] << " ";
7360 // std::cout << ") overlaps PartBox "
7361 // << (*partBoxes)[i].getpId() << " (";
7362 // for (int j = 0; j < dim; j++)
7363 // std::cout << (*partBoxes)[i].getlmins()[j] << " ";
7364 // std::cout << ") x (";
7365 // for (int j = 0; j < dim; j++)
7366 // std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
7367 // std::cout << ")" << std::endl;
7368  }
7369  }
7371  }
7372  if (nPartsFound) {
7373  *partsFound = new mj_part_t[nPartsFound];
7374  for (size_t i = 0; i < nPartsFound; i++)
7375  (*partsFound)[i] = partlist[i];
7376  }
7377  }
7378  else {
7379  // Box does not overlap the domain at all. Find the closest part
7380  // Not sure how to perform this operation for MJ without having the
7381  // cuts. With the RCB cuts, the concept of a part extending to
7382  // infinity was natural. With the boxes, it is much more difficult.
7383  // TODO: For now, return information indicating NO OVERLAP.
7384 
7385  }
7386  }
7387  else {
7388  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
7389  }
7390 }
7391 
7393 template <typename Adapter>
7395  int dim,
7396  typename Adapter::scalar_t *point) const
7397 {
7398 
7399  // TODO: Implement with cuts rather than boxes to reduce algorithmic
7400  // TODO: complexity. Or at least do a search through the boxes, using
7401  // TODO: p x q x r x ... if possible.
7402 
7403  if (this->mj_keep_part_boxes) {
7404  typename Adapter::part_t foundPart = -1;
7405 
7406  // Get vector of part boxes
7407  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7408 
7409  size_t nBoxes = (*partBoxes).size();
7410  if (nBoxes == 0) {
7411  throw std::logic_error("no part boxes exist");
7412  }
7413 
7414  // Determine whether the point is within the global domain
7415  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7416 
7417  if (globalBox->pointInBox(dim, point)) {
7418 
7419  // point is in the global domain; determine in which part it is.
7420  size_t i;
7421  for (i = 0; i < nBoxes; i++) {
7422  try {
7423  if ((*partBoxes)[i].pointInBox(dim, point)) {
7424  foundPart = (*partBoxes)[i].getpId();
7425 // std::cout << "Point (";
7426 // for (int j = 0; j < dim; j++) std::cout << point[j] << " ";
7427 // std::cout << ") found in box " << i << " part " << foundPart
7428 // << std::endl;
7429 // (*partBoxes)[i].print();
7430  break;
7431  }
7432  }
7434  }
7435 
7436  if (i == nBoxes) {
7437  // This error should never occur
7438  std::ostringstream oss;
7439  oss << "Point (";
7440  for (int j = 0; j < dim; j++) oss << point[j] << " ";
7441  oss << ") not found in domain";
7442  throw std::logic_error(oss.str());
7443  }
7444  }
7445 
7446  else {
7447  // Point is outside the global domain.
7448  // Determine to which part it is closest.
7449  // TODO: with cuts, would not need this special case
7450 
7451  size_t closestBox = 0;
7452  mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
7453  mj_scalar_t *centroid = new mj_scalar_t[dim];
7454  for (size_t i = 0; i < nBoxes; i++) {
7455  (*partBoxes)[i].computeCentroid(centroid);
7456  mj_scalar_t sum = 0.;
7457  mj_scalar_t diff;
7458  for (int j = 0; j < dim; j++) {
7459  diff = centroid[j] - point[j];
7460  sum += diff * diff;
7461  }
7462  if (sum < minDistance) {
7463  minDistance = sum;
7464  closestBox = i;
7465  }
7466  }
7467  foundPart = (*partBoxes)[closestBox].getpId();
7468  delete [] centroid;
7469  }
7470 
7471  return foundPart;
7472  }
7473  else {
7474  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
7475  }
7476 }
7477 
7478 template <typename Adapter>
7480  const PartitioningSolution<Adapter> *solution,
7481  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
7482  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
7483 {
7484  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
7485  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
7486  mj_part_t ntasks = (*pBoxes).size();
7487  int dim = (*pBoxes)[0].getDim();
7488  GridHash<mj_scalar_t, mj_part_t> grid(pBoxes, ntasks, dim);
7489  grid.getAdjArrays(comXAdj_, comAdj_);
7490  }
7491  comAdj = comAdj_;
7492  comXAdj = comXAdj_;
7493 }
7494 
7495 
7496 template <typename Adapter>
7497 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
7499 {
7500  return this->mj_partitioner.get_kept_boxes();
7501 }
7502 
7503 
7504 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7505  typename mj_part_t>
7506 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7508 {
7509  if (this->mj_keep_part_boxes)
7510  return this->kept_boxes;
7511  else
7512  throw std::logic_error("Error: part boxes are not stored.");
7513 }
7514 
7515 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7516  typename mj_part_t>
7517 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7519  RCP<mj_partBoxVector_t> &localPartBoxes
7520 ) const
7521 {
7522  mj_part_t ntasks = this->num_global_parts;
7523  int dim = (*localPartBoxes)[0].getDim();
7524  mj_scalar_t *localPartBoundaries = new mj_scalar_t[ntasks * 2 *dim];
7525 
7526  memset(localPartBoundaries, 0, sizeof(mj_scalar_t) * ntasks * 2 *dim);
7527 
7528  mj_scalar_t *globalPartBoundaries = new mj_scalar_t[ntasks * 2 *dim];
7529  memset(globalPartBoundaries, 0, sizeof(mj_scalar_t) * ntasks * 2 *dim);
7530 
7531  mj_scalar_t *localPartMins = localPartBoundaries;
7532  mj_scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
7533 
7534  mj_scalar_t *globalPartMins = globalPartBoundaries;
7535  mj_scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
7536 
7537  mj_part_t boxCount = localPartBoxes->size();
7538  for (mj_part_t i = 0; i < boxCount; ++i){
7539  mj_part_t pId = (*localPartBoxes)[i].getpId();
7540  //std::cout << "me:" << comm->getRank() << " has:" << pId << std::endl;
7541 
7542  mj_scalar_t *lmins = (*localPartBoxes)[i].getlmins();
7543  mj_scalar_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
7544 
7545  for (int j = 0; j < dim; ++j){
7546  localPartMins[dim * pId + j] = lmins[j];
7547  localPartMaxs[dim * pId + j] = lmaxs[j];
7548  /*
7549  std::cout << "me:" << comm->getRank() <<
7550  " dim * pId + j:"<< dim * pId + j <<
7551  " localMin:" << localPartMins[dim * pId + j] <<
7552  " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
7553  */
7554  }
7555  }
7556 
7557  Teuchos::Zoltan2_BoxBoundaries<int, mj_scalar_t> reductionOp(ntasks * 2 *dim);
7558 
7559  reduceAll<int, mj_scalar_t>(*mj_problemComm, reductionOp,
7560  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7561  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
7562  for (mj_part_t i = 0; i < ntasks; ++i){
7564  globalPartMins + dim * i,
7565  globalPartMaxs + dim * i);
7566 
7567  /*
7568  for (int j = 0; j < dim; ++j){
7569  std::cout << "me:" << comm->getRank() <<
7570  " dim * pId + j:"<< dim * i + j <<
7571  " globalMin:" << globalPartMins[dim * i + j] <<
7572  " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
7573  }
7574  */
7575  pB->push_back(tpb);
7576  }
7577  delete []localPartBoundaries;
7578  delete []globalPartBoundaries;
7579  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
7580  return pB;
7581 }
7582 } // namespace Zoltan2
7583 
7584 #endif
ZOLTAN2_ALGMULTIJAGGED_SWAP
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
Definition: Zoltan2_AlgMultiJagged.hpp:107
Zoltan2::allocMemory
T * allocMemory(size_t size)
Allocates memory for the given size.
Definition: Zoltan2_AlgMultiJagged.hpp:156
Zoltan2::AlgMJ::get_global_box
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
Definition: Zoltan2_AlgMultiJagged.hpp:1950
Z2_FORWARD_EXCEPTIONS
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Definition: Zoltan2_Exceptions.hpp:106
Zoltan2::uqsort
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
Definition: Zoltan2_AlgMultiJagged.hpp:296
Zoltan2::CoordinateModel
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
Definition: Zoltan2_CoordinateModel.hpp:71
SIGNIFICANCE_MUL
#define SIGNIFICANCE_MUL
Definition: Zoltan2_AlgMultiJagged.hpp:86
imbalanceOf2
#define imbalanceOf2(Wachieved, wExpected)
Definition: Zoltan2_AlgMultiJagged.hpp:103
Zoltan2::uMultiSortItem::set
void set(IT index_, CT count_, WT *vals_)
Definition: Zoltan2_AlgMultiJagged.hpp:226
Zoltan2::uMultiSortItem::operator>
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
Definition: Zoltan2_AlgMultiJagged.hpp:259
Z2_ABS
#define Z2_ABS(x)
Definition: Zoltan2_CoordinatePartitioningGraph.hpp:64
Zoltan2::AlgMJ::get_kept_boxes
RCP< mj_partBoxVector_t > get_kept_boxes() const
Definition: Zoltan2_AlgMultiJagged.hpp:7507
Zoltan2::PartitioningSolution
A PartitioningSolution is a solution to a partitioning problem.
Definition: Zoltan2_PartitioningSolution.hpp:55
Zoltan2::coordinateModelPartBox
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Definition: Zoltan2_CoordinatePartitioningGraph.hpp:70
Zoltan2::Environment::getAnyDoubleValidator
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
Definition: Zoltan2_Environment.cpp:158
Zoltan2::Zoltan2_AlgMJ
Multi Jagged coordinate partitioning algorithm.
Definition: Zoltan2_AlgMultiJagged.hpp:6501
Zoltan2::Zoltan2_AlgMJ::pointAssign
mj_part_t pointAssign(int dim, mj_scalar_t *point) const
Definition: Zoltan2_AlgMultiJagged.hpp:7394
Zoltan2::AlgMJ::set_partitioning_parameters
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, mj_scalar_t minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
Definition: Zoltan2_AlgMultiJagged.hpp:5913
Teuchos::Zoltan2_BoxBoundaries::reduce
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Definition: Zoltan2_AlgMultiJagged.hpp:139
Zoltan2_IntegerRangeList.hpp
Define IntegerRangeList validator.
Zoltan2::uSignedSortItem::id
IT id
Definition: Zoltan2_AlgMultiJagged.hpp:382
Zoltan2::MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
Definition: Zoltan2_Parameters.hpp:120
Zoltan2::Environment::getBoolValidator
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
Definition: Zoltan2_Environment.cpp:151
Z2_ASSERT_VALUE
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
Definition: Zoltan2_Exceptions.hpp:170
Zoltan2_CoordinatePartitioningGraph.hpp
Zoltan2::GridHash
GridHash Class, Hashing Class for part boxes.
Definition: Zoltan2_CoordinatePartitioningGraph.hpp:502
ZOLTAN2_ABS
#define ZOLTAN2_ABS(x)
Definition: Zoltan2_AlgMultiJagged.hpp:99
Zoltan2::uSortItem
Sort items for quick sort function.
Definition: Zoltan2_AlgMultiJagged.hpp:285
xml2dox.root
root
Definition: xml2dox.py:168
Zoltan2::uMultiSortItem::index
volatile IT index
Definition: Zoltan2_AlgMultiJagged.hpp:194
Zoltan2::uMultiSortItem::uMultiSortItem
uMultiSortItem()
Definition: Zoltan2_AlgMultiJagged.hpp:200
Zoltan2::uSortItem::id
IT id
Definition: Zoltan2_AlgMultiJagged.hpp:287
Zoltan2::uMultiSortItem::count
volatile CT count
Definition: Zoltan2_AlgMultiJagged.hpp:195
xml2dox.vals
dictionary vals
Definition: xml2dox.py:186
Zoltan2::Zoltan2_AlgMJ::getPartBoxesView
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
Definition: Zoltan2_AlgMultiJagged.hpp:6685
Zoltan2::uMultiSortItem::val
volatile WT * val
Definition: Zoltan2_AlgMultiJagged.hpp:197
part_t
SparseMatrixAdapter_t::part_t part_t
Definition: partitioningTree.cpp:74
Zoltan2::uSignedSortItem::operator>=
bool operator>=(const uSignedSortItem< IT, WT, SIGN > &rhs)
Definition: Zoltan2_AlgMultiJagged.hpp:439
Zoltan2::uSortItem::val
WT val
Definition: Zoltan2_AlgMultiJagged.hpp:289
Zoltan2_CoordinateModel.hpp
Defines the CoordinateModel classes.
Zoltan2::uMultiSortItem::~uMultiSortItem
~uMultiSortItem()
Definition: Zoltan2_AlgMultiJagged.hpp:222
Zoltan2::AlgMJ
Multi Jagged coordinate partitioning algorithm.
Definition: Zoltan2_AlgMultiJagged.hpp:532
Zoltan2::uSignedSortItem::operator<
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Definition: Zoltan2_AlgMultiJagged.hpp:386
Zoltan2::Algorithm
Algorithm defines the base class for all algorithms.
Definition: Zoltan2_Algorithm.hpp:55
Zoltan2::uMultiSortItem::operator=
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
Definition: Zoltan2_AlgMultiJagged.hpp:233
Zoltan2::uqSignsort
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
Definition: Zoltan2_AlgMultiJagged.hpp:447
epsilon
#define epsilon
Definition: partition2DMatrix.cpp:97
Zoltan2::global_size_t
Tpetra::global_size_t global_size_t
Definition: Zoltan2_Standards.hpp:119
Zoltan2::uSignedSortItem::val
WT val
Definition: Zoltan2_AlgMultiJagged.hpp:384
Zoltan2::Zoltan2_AlgMJ::~Zoltan2_AlgMJ
~Zoltan2_AlgMJ()
Definition: Zoltan2_AlgMultiJagged.hpp:6614
Zoltan2::uSignedSortItem::signbit
SIGN signbit
Definition: Zoltan2_AlgMultiJagged.hpp:385
Zoltan2::AlgMJ::compute_global_box_boundaries
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
Definition: Zoltan2_AlgMultiJagged.hpp:7518
Zoltan2::uMultiSortItem::uMultiSortItem
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
Definition: Zoltan2_AlgMultiJagged.hpp:215
Zoltan2::Environment::getAnyIntValidator
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
Definition: Zoltan2_Environment.cpp:169
FUTURE_REDUCEALL_CUTOFF
#define FUTURE_REDUCEALL_CUTOFF
Definition: Zoltan2_AlgMultiJagged.hpp:91
Teuchos::MultiJaggedCombinedReductionOp
Definition: Zoltan2_MultiJagged_ReductionOps.hpp:57
Z2_THROW_OUTSIDE_ERROR
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Definition: Zoltan2_Exceptions.hpp:64
Zoltan2::IntegerRangeListValidator
A ParameterList validator for integer range lists.
Definition: Zoltan2_IntegerRangeList.hpp:83
Zoltan2::Zoltan2_AlgMJ::getCommunicationGraph
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
Definition: Zoltan2_AlgMultiJagged.hpp:7479
Zoltan2::uMultiSortItem::uMultiSortItem
uMultiSortItem(IT index_, CT count_, WT *vals_)
Definition: Zoltan2_AlgMultiJagged.hpp:208
LEAST_SIGNIFICANCE
#define LEAST_SIGNIFICANCE
Definition: Zoltan2_AlgMultiJagged.hpp:85
Zoltan2::uSignedSortItem
Definition: Zoltan2_AlgMultiJagged.hpp:380
Teuchos::Zoltan2_BoxBoundaries::Zoltan2_BoxBoundaries
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Definition: Zoltan2_AlgMultiJagged.hpp:134
Zoltan2::AlgMJ::AlgMJ
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
Definition: Zoltan2_AlgMultiJagged.hpp:1904
Zoltan2::AlgMJ::sequential_task_partitioning
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, mj_scalar_t **mj_coordinates, mj_lno_t *initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth, const mj_part_t *part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_)
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
Definition: Zoltan2_AlgMultiJagged.hpp:1397
Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp
Definition: Zoltan2_MultiJagged_ReductionOps.hpp:133
Zoltan2::AlgMJ::multi_jagged_part
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
Definition: Zoltan2_AlgMultiJagged.hpp:5960
Zoltan2::uSignedSortItem::operator>
bool operator>(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Definition: Zoltan2_AlgMultiJagged.hpp:412
MIN_WORK_LAST_DIM
#define MIN_WORK_LAST_DIM
Definition: Zoltan2_AlgMultiJagged.hpp:94
Zoltan2::uMultiSortItem::operator<
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
Definition: Zoltan2_AlgMultiJagged.hpp:240
Teuchos::Zoltan2_BoxBoundaries::Zoltan2_BoxBoundaries
Zoltan2_BoxBoundaries()
Default Constructor.
Definition: Zoltan2_AlgMultiJagged.hpp:126
Zoltan2::Zoltan2_AlgMJ::partition
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
Definition: Zoltan2_AlgMultiJagged.hpp:6836
Zoltan2::freeArray
void freeArray(T *&array)
Frees the given array.
Definition: Zoltan2_AlgMultiJagged.hpp:173
Teuchos::Zoltan2_BoxBoundaries
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
Definition: Zoltan2_AlgMultiJagged.hpp:117
weights
static ArrayRCP< ArrayRCP< zscalar_t > > weights
Definition: rcbPerformanceZ1.cpp:82
Zoltan2
Definition: Zoltan2_AlgSerialGreedy.hpp:56
Zoltan2::uMultiSortItem::_EPSILON
volatile WT _EPSILON
Definition: Zoltan2_AlgMultiJagged.hpp:198
Zoltan2::uMultiSortItem
Class for sorting items with multiple values. First sorting with respect to val[0],...
Definition: Zoltan2_AlgMultiJagged.hpp:189
Zoltan2::GridHash::getAdjArrays
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Definition: Zoltan2_CoordinatePartitioningGraph.hpp:610
Teuchos
Definition: Zoltan2_AlgMultiJagged.hpp:110
Zoltan2_Parameters.hpp
Defines Parameter related enumerators, declares functions.
Zoltan2_Algorithm.hpp
Zoltan2::Zoltan2_AlgMJ::getValidParameters
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
Definition: Zoltan2_AlgMultiJagged.hpp:6623
Zoltan2::Zoltan2_AlgMJ::Zoltan2_AlgMJ
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Definition: Zoltan2_AlgMultiJagged.hpp:6592
Zoltan2_Util.hpp
A gathering of useful namespace methods.
Zoltan2::AlgMJ::set_to_keep_part_boxes
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Definition: Zoltan2_AlgMultiJagged.hpp:1960
Zoltan2::Zoltan2_AlgMJ::boxAssign
void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
Definition: Zoltan2_AlgMultiJagged.hpp:7316
Zoltan2_MultiJagged_ReductionOps.hpp
Contains Teuchos reduction operators for the Multi-jagged algorithm.
Zoltan2::uSignedSortItem::operator<=
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
Definition: Zoltan2_AlgMultiJagged.hpp:437