49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
57 #include <Teuchos_StandardParameterEntryValidators.hpp>
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_ParameterList.hpp>
67 #if defined(__cplusplus) && __cplusplus >= 201103L
68 #include <unordered_map>
70 #include <Teuchos_Hashtable.hpp>
71 #endif // C++11 is enabled
73 #ifdef ZOLTAN2_USEZOLTANCOMM
74 #ifdef HAVE_ZOLTAN2_MPI
75 #define ENABLE_ZOLTAN_MIGRATION
76 #include "zoltan_comm_cpp.h"
77 #include "zoltan_types.h"
81 #ifdef HAVE_ZOLTAN2_OMP
85 #define LEAST_SIGNIFICANCE 0.0001
86 #define SIGNIFICANCE_MUL 1000
91 #define FUTURE_REDUCEALL_CUTOFF 1500000
94 #define MIN_WORK_LAST_DIM 1000
99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
102 (Wachieved) / ((totalW) * (expectedRatio)) - 1
103 #define imbalanceOf2(Wachieved, wExpected) \
104 (Wachieved) / (wExpected) - 1
107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
116 template <
typename Ordinal,
typename T>
135 size(s_), _EPSILON (std::numeric_limits<T>::
epsilon()){}
139 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const
141 for (Ordinal i=0; i < count; i++){
142 if (
Z2_ABS(inBuffer[i]) > _EPSILON){
143 inoutBuffer[i] = inBuffer[i];
155 template <
typename T>
160 throw "cannot allocate memory";
172 template <
typename T>
188 template <
typename IT,
typename CT,
typename WT>
209 this->index = index_;
210 this->count = count_;
216 this->index = other.
index;
217 this->count = other.
count;
218 this->val = other.
val;
226 void set(IT index_ ,CT count_, WT *vals_){
227 this->index = index_;
228 this->count = count_;
234 this->index = other.
index;
235 this->count = other.
count;
236 this->val = other.
val;
241 assert (this->count == other.
count);
242 for(CT i = 0; i < this->
count; ++i){
248 if(this->val[i] < other.
val[i]){
257 return this->index < other.
index;
260 assert (this->count == other.
count);
261 for(CT i = 0; i < this->
count; ++i){
267 if(this->val[i] > other.
val[i]){
277 return this->index > other.
index;
284 template <
class IT,
class WT>
295 template <
class IT,
class WT>
301 IT i, ir=n, j, k, l=1;
302 IT jstack=0, istack[50];
311 for (j=l+1;j<=ir;j++)
317 if (arr[i].val <= aval)
333 if (arr[l+1].val > arr[ir].val)
337 if (arr[l].val > arr[ir].val)
341 if (arr[l+1].val > arr[l].val)
351 do i++;
while (arr[i].val < aval);
352 do j--;
while (arr[j].val > aval);
359 if (jstack > NSTACK){
360 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
379 template <
class IT,
class WT,
class SIGN>
388 if (this->signbit < rhs.
signbit){
392 else if (this->signbit == rhs.
signbit){
394 if (this->val < rhs.
val){
398 else if (this->val > rhs.
val){
414 if (this->signbit > rhs.
signbit){
418 else if (this->signbit == rhs.
signbit){
420 if (this->val < rhs.
val){
424 else if (this->val > rhs.
val){
438 return !(*
this > rhs);}
440 return !(*
this < rhs);}
446 template <
class IT,
class WT,
class SIGN>
451 IT i, ir=n, j, k, l=1;
452 IT jstack=0, istack[50];
460 for (j=l+1;j<=ir;j++)
482 if (arr[l+1] > arr[ir])
486 if (arr[l] > arr[ir])
490 if (arr[l+1] > arr[l])
499 do i++;
while (arr[i] < a);
500 do j--;
while (arr[j] > a);
507 if (jstack > NSTACK){
508 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
530 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
536 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
538 RCP<const Environment> mj_env;
539 RCP<const Comm<int> > mj_problemComm;
541 double imbalance_tolerance;
542 mj_part_t *part_no_array;
544 int coord_dim, num_weights_per_coord;
546 size_t initial_num_loc_coords;
549 mj_lno_t num_local_coords;
550 mj_gno_t num_global_coords;
552 mj_scalar_t **mj_coordinates;
553 mj_scalar_t **mj_weights;
554 bool *mj_uniform_parts;
555 mj_scalar_t **mj_part_sizes;
556 bool *mj_uniform_weights;
558 ArrayView<const mj_gno_t> mj_gnos;
559 size_t num_global_parts;
561 mj_gno_t *initial_mj_gnos;
562 mj_gno_t *current_mj_gnos;
563 int *owner_of_coordinate;
565 mj_lno_t *coordinate_permutations;
566 mj_lno_t *new_coordinate_permutations;
567 mj_part_t *assigned_part_ids;
570 mj_lno_t *new_part_xadj;
573 bool distribute_points_on_cut_lines;
574 mj_part_t max_concurrent_part_calculation;
577 int mj_user_recursion_depth;
578 bool mj_keep_part_boxes;
580 int check_migrate_avoid_migration_option;
583 mj_scalar_t minimum_migration_imbalance;
586 mj_part_t total_num_cut ;
587 mj_part_t total_num_part;
589 mj_part_t max_num_part_along_dim ;
590 mj_part_t max_num_cut_along_dim;
591 size_t max_num_total_part_along_dim;
593 mj_part_t total_dim_num_reduce_all;
594 mj_part_t last_dim_num_part;
598 RCP<Comm<int> > comm;
600 mj_scalar_t sEpsilon;
602 mj_scalar_t maxScalar_t;
603 mj_scalar_t minScalar_t;
605 mj_scalar_t *all_cut_coordinates;
606 mj_scalar_t *max_min_coords;
607 mj_scalar_t *process_cut_line_weight_to_put_left;
608 mj_scalar_t **thread_cut_line_weight_to_put_left;
614 mj_scalar_t *cut_coordinates_work_array;
617 mj_scalar_t *target_part_weights;
619 mj_scalar_t *cut_upper_bound_coordinates ;
620 mj_scalar_t *cut_lower_bound_coordinates ;
621 mj_scalar_t *cut_lower_bound_weights ;
622 mj_scalar_t *cut_upper_bound_weights ;
624 mj_scalar_t *process_local_min_max_coord_total_weight ;
625 mj_scalar_t *global_min_max_coord_total_weight ;
629 bool *is_cut_line_determined;
632 mj_part_t *my_incomplete_cut_count;
634 double **thread_part_weights;
636 double **thread_part_weight_work;
639 mj_scalar_t **thread_cut_left_closest_point;
641 mj_scalar_t **thread_cut_right_closest_point;
644 mj_lno_t **thread_point_counts;
646 mj_scalar_t *process_rectilinear_cut_weight;
647 mj_scalar_t *global_rectilinear_cut_weight;
653 mj_scalar_t *total_part_weight_left_right_closests ;
654 mj_scalar_t *global_total_part_weight_left_right_closests;
656 RCP<mj_partBoxVector_t> kept_boxes;
659 RCP<mj_partBox_t> global_box;
660 int myRank, myActualRank;
662 bool divide_to_prime_first;
671 void set_part_specifications();
678 inline mj_part_t get_part_count(
679 mj_part_t num_total_future,
685 void allocate_set_work_memory();
692 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
695 void compute_global_box();
711 mj_part_t update_part_num_arrays(
712 std::vector<mj_part_t> &num_partitioning_in_current_dim,
713 std::vector<mj_part_t> *future_num_part_in_parts,
714 std::vector<mj_part_t> *next_future_num_parts_in_parts,
715 mj_part_t &future_num_parts,
716 mj_part_t current_num_parts,
717 int current_iteration,
718 RCP<mj_partBoxVector_t> input_part_boxes,
719 RCP<mj_partBoxVector_t> output_part_boxes,
720 mj_part_t atomic_part_count);
733 void mj_get_local_min_max_coord_totW(
734 mj_lno_t coordinate_begin_index,
735 mj_lno_t coordinate_end_index,
736 mj_lno_t *mj_current_coordinate_permutations,
737 mj_scalar_t *mj_current_dim_coords,
738 mj_scalar_t &min_coordinate,
739 mj_scalar_t &max_coordinate,
740 mj_scalar_t &total_weight);
749 void mj_get_global_min_max_coord_totW(
750 mj_part_t current_concurrent_num_parts,
751 mj_scalar_t *local_min_max_total,
752 mj_scalar_t *global_min_max_total);
772 void mj_get_initial_cut_coords_target_weights(
773 mj_scalar_t min_coord,
774 mj_scalar_t max_coord,
776 mj_scalar_t global_weight,
777 mj_scalar_t *initial_cut_coords ,
778 mj_scalar_t *target_part_weights ,
780 std::vector <mj_part_t> *future_num_part_in_parts,
781 std::vector <mj_part_t> *next_future_num_parts_in_parts,
782 mj_part_t concurrent_current_part,
783 mj_part_t obtained_part_index);
797 void set_initial_coordinate_parts(
798 mj_scalar_t &max_coordinate,
799 mj_scalar_t &min_coordinate,
800 mj_part_t &concurrent_current_part_index,
801 mj_lno_t coordinate_begin_index,
802 mj_lno_t coordinate_end_index,
803 mj_lno_t *mj_current_coordinate_permutations,
804 mj_scalar_t *mj_current_dim_coords,
805 mj_part_t *mj_part_ids,
806 mj_part_t &partition_count);
819 mj_scalar_t *mj_current_dim_coords,
820 mj_scalar_t imbalanceTolerance,
821 mj_part_t current_work_part,
822 mj_part_t current_concurrent_num_parts,
823 mj_scalar_t *current_cut_coordinates,
824 mj_part_t total_incomplete_cut_count,
825 std::vector <mj_part_t> &num_partitioning_in_current_dim);
846 void mj_1D_part_get_thread_part_weights(
847 size_t total_part_count,
849 mj_scalar_t max_coord,
850 mj_scalar_t min_coord,
851 mj_lno_t coordinate_begin_index,
852 mj_lno_t coordinate_end_index,
853 mj_scalar_t *mj_current_dim_coords,
854 mj_scalar_t *temp_current_cut_coords,
855 bool *current_cut_status,
856 double *my_current_part_weights,
857 mj_scalar_t *my_current_left_closest,
858 mj_scalar_t *my_current_right_closest);
867 void mj_accumulate_thread_results(
868 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
869 mj_part_t current_work_part,
870 mj_part_t current_concurrent_num_parts);
902 void mj_get_new_cut_coordinates(
903 const size_t &num_total_part,
904 const mj_part_t &num_cuts,
905 const mj_scalar_t &max_coordinate,
906 const mj_scalar_t &min_coordinate,
907 const mj_scalar_t &global_total_weight,
908 const mj_scalar_t &used_imbalance_tolerance,
909 mj_scalar_t * current_global_part_weights,
910 const mj_scalar_t * current_local_part_weights,
911 const mj_scalar_t *current_part_target_weights,
912 bool *current_cut_line_determined,
913 mj_scalar_t *current_cut_coordinates,
914 mj_scalar_t *current_cut_upper_bounds,
915 mj_scalar_t *current_cut_lower_bounds,
916 mj_scalar_t *current_global_left_closest_points,
917 mj_scalar_t *current_global_right_closest_points,
918 mj_scalar_t * current_cut_lower_bound_weights,
919 mj_scalar_t * current_cut_upper_weights,
920 mj_scalar_t *new_current_cut_coordinates,
921 mj_scalar_t *current_part_cut_line_weight_to_put_left,
922 mj_part_t *rectilinear_cut_count,
923 mj_part_t &my_num_incomplete_cut);
934 void mj_calculate_new_cut_position (
935 mj_scalar_t cut_upper_bound,
936 mj_scalar_t cut_lower_bound,
937 mj_scalar_t cut_upper_weight,
938 mj_scalar_t cut_lower_weight,
939 mj_scalar_t expected_weight,
940 mj_scalar_t &new_cut_position);
952 void mj_create_new_partitions(
954 mj_scalar_t *mj_current_dim_coords,
955 mj_scalar_t *current_concurrent_cut_coordinate,
956 mj_lno_t coordinate_begin,
957 mj_lno_t coordinate_end,
958 mj_scalar_t *used_local_cut_line_weight_to_left,
959 double **used_thread_part_weight_work,
960 mj_lno_t *out_part_xadj);
984 bool mj_perform_migration(
985 mj_part_t in_num_parts,
986 mj_part_t &out_num_parts,
987 std::vector<mj_part_t> *next_future_num_parts_in_parts,
988 mj_part_t &output_part_begin_index,
989 size_t migration_reduce_all_population,
990 mj_lno_t num_coords_for_last_dim_part,
991 std::string iteration,
992 RCP<mj_partBoxVector_t> &input_part_boxes,
993 RCP<mj_partBoxVector_t> &output_part_boxes);
1004 void get_processor_num_points_in_parts(
1005 mj_part_t num_procs,
1006 mj_part_t num_parts,
1007 mj_gno_t *&num_points_in_all_processor_parts);
1021 bool mj_check_to_migrate(
1022 size_t migration_reduce_all_population,
1023 mj_lno_t num_coords_for_last_dim_part,
1024 mj_part_t num_procs,
1025 mj_part_t num_parts,
1026 mj_gno_t *num_points_in_all_processor_parts);
1046 void mj_migration_part_proc_assignment(
1047 mj_gno_t * num_points_in_all_processor_parts,
1048 mj_part_t num_parts,
1049 mj_part_t num_procs,
1050 mj_lno_t *send_count_to_each_proc,
1051 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1052 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1053 mj_part_t &out_num_part,
1054 std::vector<mj_part_t> &out_part_indices,
1055 mj_part_t &output_part_numbering_begin_index,
1056 int *coordinate_destinations);
1074 void mj_assign_proc_to_parts(
1075 mj_gno_t * num_points_in_all_processor_parts,
1076 mj_part_t num_parts,
1077 mj_part_t num_procs,
1078 mj_lno_t *send_count_to_each_proc,
1079 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1080 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1081 mj_part_t &out_part_index,
1082 mj_part_t &output_part_numbering_begin_index,
1083 int *coordinate_destinations);
1095 void assign_send_destinations(
1096 mj_part_t num_parts,
1097 mj_part_t *part_assignment_proc_begin_indices,
1098 mj_part_t *processor_chains_in_parts,
1099 mj_lno_t *send_count_to_each_proc,
1100 int *coordinate_destinations);
1114 void assign_send_destinations2(
1115 mj_part_t num_parts,
1117 int *coordinate_destinations,
1118 mj_part_t &output_part_numbering_begin_index,
1119 std::vector<mj_part_t> *next_future_num_parts_in_parts);
1137 void mj_assign_parts_to_procs(
1138 mj_gno_t * num_points_in_all_processor_parts,
1139 mj_part_t num_parts,
1140 mj_part_t num_procs,
1141 mj_lno_t *send_count_to_each_proc,
1142 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1143 mj_part_t &out_num_part,
1144 std::vector<mj_part_t> &out_part_indices,
1145 mj_part_t &output_part_numbering_begin_index,
1146 int *coordinate_destinations);
1160 void mj_migrate_coords(
1161 mj_part_t num_procs,
1162 mj_lno_t &num_new_local_points,
1163 std::string iteration,
1164 int *coordinate_destinations,
1165 mj_part_t num_parts);
1173 void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1181 void fill_permutation_array(
1182 mj_part_t output_num_parts,
1183 mj_part_t num_parts);
1193 void set_final_parts(
1194 mj_part_t current_num_parts,
1195 mj_part_t output_part_begin_index,
1196 RCP<mj_partBoxVector_t> &output_part_boxes,
1197 bool is_data_ever_migrated);
1200 void free_work_memory();
1214 void create_consistent_chunks(
1215 mj_part_t num_parts,
1216 mj_scalar_t *mj_current_dim_coords,
1217 mj_scalar_t *current_concurrent_cut_coordinate,
1218 mj_lno_t coordinate_begin,
1219 mj_lno_t coordinate_end,
1220 mj_scalar_t *used_local_cut_line_weight_to_left,
1221 mj_lno_t *out_part_xadj,
1228 mj_part_t find_largest_prime_factor(mj_part_t num_parts){
1229 mj_part_t largest_factor = 1;
1230 mj_part_t n = num_parts;
1231 mj_part_t divisor = 2;
1233 while (n % divisor == 0){
1235 largest_factor = divisor;
1238 if (divisor * divisor > n){
1245 return largest_factor;
1279 const RCP<const Environment> &env,
1280 RCP<
const Comm<int> > &problemComm,
1282 double imbalance_tolerance,
1283 size_t num_global_parts,
1284 mj_part_t *part_no_array,
1285 int recursion_depth,
1288 mj_lno_t num_local_coords,
1289 mj_gno_t num_global_coords,
1290 const mj_gno_t *initial_mj_gnos,
1291 mj_scalar_t **mj_coordinates,
1293 int num_weights_per_coord,
1294 bool *mj_uniform_weights,
1295 mj_scalar_t **mj_weights,
1296 bool *mj_uniform_parts,
1297 mj_scalar_t **mj_part_sizes,
1299 mj_part_t *&result_assigned_part_ids,
1300 mj_gno_t *&result_mj_gnos
1312 bool distribute_points_on_cut_lines_,
1313 int max_concurrent_part_calculation_,
1314 int check_migrate_avoid_migration_option_,
1315 mj_scalar_t minimum_migration_imbalance_,
int migration_type_ = 0);
1328 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1355 const RCP<const Environment> &env,
1356 mj_lno_t num_total_coords,
1357 mj_lno_t num_selected_coords,
1358 size_t num_target_part,
1360 mj_scalar_t **mj_coordinates,
1361 mj_lno_t *initial_selected_coords_output_permutation,
1362 mj_lno_t *output_xadj,
1363 int recursion_depth,
1364 const mj_part_t *part_no_array,
1365 bool partition_along_longest_dim,
1366 int num_ranks_per_node,
1367 bool divide_to_prime_first_);
1395 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1398 const RCP<const Environment> &env,
1399 mj_lno_t num_total_coords,
1400 mj_lno_t num_selected_coords,
1401 size_t num_target_part,
1403 mj_scalar_t **mj_coordinates_,
1404 mj_lno_t *inital_adjList_output_adjlist,
1405 mj_lno_t *output_xadj,
1407 const mj_part_t *part_no_array_,
1408 bool partition_along_longest_dim,
1409 int num_ranks_per_node,
1410 bool divide_to_prime_first_
1415 const RCP<Comm<int> > commN;
1416 this->mj_problemComm =
1417 Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1419 Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1420 this->myActualRank = this->myRank = 1;
1422 #ifdef HAVE_ZOLTAN2_OMP
1427 this->divide_to_prime_first = divide_to_prime_first_;
1432 this->imbalance_tolerance = 0;
1433 this->num_global_parts = num_target_part;
1434 this->part_no_array = (mj_part_t *)part_no_array_;
1435 this->recursion_depth = rd;
1437 this->coord_dim = coord_dim_;
1438 this->num_local_coords = num_total_coords;
1439 this->num_global_coords = num_total_coords;
1440 this->mj_coordinates = mj_coordinates_;
1444 this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1446 this->num_weights_per_coord = 0;
1447 bool *tmp_mj_uniform_weights =
new bool[1];
1448 this->mj_uniform_weights = tmp_mj_uniform_weights ;
1449 this->mj_uniform_weights[0] =
true;
1451 mj_scalar_t **tmp_mj_weights =
new mj_scalar_t *[1];
1452 this->mj_weights = tmp_mj_weights;
1454 bool *tmp_mj_uniform_parts =
new bool[1];
1455 this->mj_uniform_parts = tmp_mj_uniform_parts;
1456 this->mj_uniform_parts[0] =
true;
1458 mj_scalar_t **tmp_mj_part_sizes =
new mj_scalar_t * [1];
1459 this->mj_part_sizes = tmp_mj_part_sizes;
1460 this->mj_part_sizes[0] = NULL;
1462 this->num_threads = 1;
1463 this->set_part_specifications();
1465 this->allocate_set_work_memory();
1467 this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
1468 for(
size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1469 this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1472 mj_part_t current_num_parts = 1;
1474 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1476 mj_part_t future_num_parts = this->total_num_part;
1478 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
1479 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
1480 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1481 RCP<mj_partBoxVector_t> t1;
1482 RCP<mj_partBoxVector_t> t2;
1485 std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1487 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1488 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1490 for (
int i = 0; i < this->recursion_depth; ++i){
1494 std::vector <mj_part_t> num_partitioning_in_current_dim;
1508 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1509 future_num_part_in_parts = next_future_num_parts_in_parts;
1510 next_future_num_parts_in_parts = tmpPartVect;
1515 next_future_num_parts_in_parts->clear();
1519 mj_part_t output_part_count_in_dimension =
1520 this->update_part_num_arrays(
1521 num_partitioning_in_current_dim,
1522 future_num_part_in_parts,
1523 next_future_num_parts_in_parts,
1528 t2, num_ranks_per_node);
1533 if(output_part_count_in_dimension == current_num_parts) {
1534 tmpPartVect= future_num_part_in_parts;
1535 future_num_part_in_parts = next_future_num_parts_in_parts;
1536 next_future_num_parts_in_parts = tmpPartVect;
1541 std::string istring = Teuchos::toString<int>(i);
1545 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1548 mj_part_t output_part_index = 0;
1551 mj_part_t output_coordinate_end_index = 0;
1553 mj_part_t current_work_part = 0;
1554 mj_part_t current_concurrent_num_parts = 1;
1556 mj_part_t obtained_part_index = 0;
1559 int coordInd = i % this->coord_dim;
1560 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1564 for (; current_work_part < current_num_parts;
1565 current_work_part += current_concurrent_num_parts){
1571 mj_part_t actual_work_part_count = 0;
1575 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1576 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1580 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1583 ++actual_work_part_count;
1584 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1585 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1595 if(partition_along_longest_dim){
1597 mj_scalar_t best_weight_coord = 0;
1598 for (
int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1599 mj_scalar_t best_min_coord = 0;
1600 mj_scalar_t best_max_coord = 0;
1603 this->mj_get_local_min_max_coord_totW(
1604 coordinate_begin_index,
1605 coordinate_end_index,
1606 this->coordinate_permutations,
1607 this->mj_coordinates[coord_traverse_ind],
1613 coord_dim_mins[coord_traverse_ind] = best_min_coord;
1614 coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1615 mj_scalar_t best_range = best_max_coord - best_min_coord;
1616 coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1617 coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1618 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1622 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1623 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1634 mj_current_dim_coords = this->mj_coordinates[coordInd];
1636 this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1637 this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1638 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1642 this->mj_get_local_min_max_coord_totW(
1643 coordinate_begin_index,
1644 coordinate_end_index,
1645 this->coordinate_permutations,
1646 mj_current_dim_coords,
1647 this->process_local_min_max_coord_total_weight[kk],
1648 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
1649 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]
1655 if (actual_work_part_count > 0){
1657 this->mj_get_global_min_max_coord_totW(
1658 current_concurrent_num_parts,
1659 this->process_local_min_max_coord_total_weight,
1660 this->global_min_max_coord_total_weight);
1664 mj_part_t total_incomplete_cut_count = 0;
1669 mj_part_t concurrent_part_cut_shift = 0;
1670 mj_part_t concurrent_part_part_shift = 0;
1673 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1674 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1675 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1676 current_concurrent_num_parts];
1677 mj_scalar_t global_total_weight =
1678 this->global_min_max_coord_total_weight[kk +
1679 2 * current_concurrent_num_parts];
1681 mj_part_t concurrent_current_part_index = current_work_part + kk;
1683 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1685 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1686 mj_scalar_t *current_target_part_weights = this->target_part_weights +
1687 concurrent_part_part_shift;
1689 concurrent_part_cut_shift += partition_count - 1;
1691 concurrent_part_part_shift += partition_count;
1695 if(partition_count > 1 && min_coordinate <= max_coordinate){
1699 total_incomplete_cut_count += partition_count - 1;
1702 this->my_incomplete_cut_count[kk] = partition_count - 1;
1705 this->mj_get_initial_cut_coords_target_weights(
1708 partition_count - 1,
1709 global_total_weight,
1711 current_target_part_weights,
1712 future_num_part_in_parts,
1713 next_future_num_parts_in_parts,
1714 concurrent_current_part_index,
1715 obtained_part_index);
1717 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1718 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1721 this->set_initial_coordinate_parts(
1724 concurrent_current_part_index,
1725 coordinate_begin_index, coordinate_end_index,
1726 this->coordinate_permutations,
1727 mj_current_dim_coords,
1728 this->assigned_part_ids,
1734 this->my_incomplete_cut_count[kk] = 0;
1736 obtained_part_index += partition_count;
1740 mj_scalar_t used_imbalance = 0;
1745 mj_current_dim_coords,
1748 current_concurrent_num_parts,
1749 current_cut_coordinates,
1750 total_incomplete_cut_count,
1751 num_partitioning_in_current_dim);
1754 obtained_part_index += current_concurrent_num_parts;
1760 mj_part_t output_array_shift = 0;
1761 mj_part_t cut_shift = 0;
1762 size_t tlr_shift = 0;
1763 size_t partweight_array_shift = 0;
1765 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1766 mj_part_t current_concurrent_work_part = current_work_part + kk;
1767 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1770 if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1771 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1773 for(mj_part_t jj = 0; jj < num_parts; ++jj){
1774 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1776 cut_shift += num_parts - 1;
1777 tlr_shift += (4 *(num_parts - 1) + 1);
1778 output_array_shift += num_parts;
1779 partweight_array_shift += (2 * (num_parts - 1) + 1);
1783 mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1784 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1786 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1787 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1790 for(
int ii = 0; ii < this->num_threads; ++ii){
1791 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1796 this->create_consistent_chunks(
1798 mj_current_dim_coords,
1799 current_concurrent_cut_coordinate,
1802 used_local_cut_line_weight_to_left,
1803 this->new_part_xadj + output_part_index + output_array_shift,
1805 partition_along_longest_dim,
1806 p_coord_dimension_range_sorted);
1811 mj_lno_t part_size = coordinate_end - coordinate_begin;
1812 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1813 memcpy(this->new_coordinate_permutations + coordinate_begin,
1814 this->coordinate_permutations + coordinate_begin,
1815 part_size *
sizeof(mj_lno_t));
1820 cut_shift += num_parts - 1;
1821 tlr_shift += (4 *(num_parts - 1) + 1);
1822 output_array_shift += num_parts;
1823 partweight_array_shift += (2 * (num_parts - 1) + 1);
1832 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1833 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1834 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1836 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1838 mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1839 mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1841 for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1842 mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1844 mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1849 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1851 output_part_index += num_parts ;
1858 current_num_parts = output_part_count_in_dimension;
1861 mj_lno_t * tmp = this->coordinate_permutations;
1862 this->coordinate_permutations = this->new_coordinate_permutations;
1863 this->new_coordinate_permutations = tmp;
1865 freeArray<mj_lno_t>(this->part_xadj);
1866 this->part_xadj = this->new_part_xadj;
1867 this->new_part_xadj = NULL;
1870 for(mj_lno_t i = 0; i < num_total_coords; ++i){
1871 inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1876 for(
size_t i = 0; i < this->num_global_parts ; ++i){
1877 output_xadj[i+1] = this->part_xadj[i];
1880 delete future_num_part_in_parts;
1881 delete next_future_num_parts_in_parts;
1884 freeArray<mj_part_t>(this->assigned_part_ids);
1885 freeArray<mj_gno_t>(this->initial_mj_gnos);
1886 freeArray<mj_gno_t>(this->current_mj_gnos);
1887 freeArray<bool>(tmp_mj_uniform_weights);
1888 freeArray<bool>(tmp_mj_uniform_parts);
1889 freeArray<mj_scalar_t *>(tmp_mj_weights);
1890 freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1892 this->free_work_memory();
1894 #ifdef HAVE_ZOLTAN2_OMP
1902 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1905 mj_env(), mj_problemComm(), imbalance_tolerance(0),
1906 part_no_array(NULL), recursion_depth(0), coord_dim(0),
1907 num_weights_per_coord(0), initial_num_loc_coords(0),
1908 initial_num_glob_coords(0),
1909 num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1910 mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1911 mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1912 initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1913 coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1914 assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1915 distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1916 mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
1917 check_migrate_avoid_migration_option(0), migration_type(0), minimum_migration_imbalance(0.30),
1918 num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1919 max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1920 last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1921 all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1922 thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1923 target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1924 cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1925 process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1926 is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1927 thread_part_weights(NULL), thread_part_weight_work(NULL),
1928 thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1929 thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1930 global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1931 global_total_part_weight_left_right_closests(NULL),
1932 kept_boxes(),global_box(),
1933 myRank(0), myActualRank(0), divide_to_prime_first(false)
1938 this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1939 this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1947 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1949 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1952 return this->global_box;
1958 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1961 this->mj_keep_part_boxes =
true;
1972 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1976 this->total_num_cut = 0;
1977 this->total_num_part = 1;
1978 this->max_num_part_along_dim = 0;
1979 this->total_dim_num_reduce_all = 0;
1980 this->last_dim_num_part = 1;
1983 this->max_num_cut_along_dim = 0;
1984 this->max_num_total_part_along_dim = 0;
1986 if (this->part_no_array){
1988 for (
int i = 0; i < this->recursion_depth; ++i){
1989 this->total_dim_num_reduce_all += this->total_num_part;
1990 this->total_num_part *= this->part_no_array[i];
1991 if(this->part_no_array[i] > this->max_num_part_along_dim) {
1992 this->max_num_part_along_dim = this->part_no_array[i];
1995 this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1996 this->num_global_parts = this->total_num_part;
1998 mj_part_t future_num_parts = this->num_global_parts;
2001 for (
int i = 0; i < this->recursion_depth; ++i){
2003 mj_part_t maxNoPartAlongI = this->get_part_count(
2004 future_num_parts, 1.0f / (this->recursion_depth - i));
2006 if (maxNoPartAlongI > this->max_num_part_along_dim){
2007 this->max_num_part_along_dim = maxNoPartAlongI;
2010 mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
2011 if (future_num_parts % maxNoPartAlongI){
2014 future_num_parts = nfutureNumParts;
2016 this->total_num_part = this->num_global_parts;
2018 if (this->divide_to_prime_first){
2019 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2020 this->last_dim_num_part = this->num_global_parts;
2029 for (
int i = 0; i < this->recursion_depth; ++i){
2030 this->total_dim_num_reduce_all += p;
2031 p *= this->max_num_part_along_dim;
2034 if (p / this->max_num_part_along_dim > this->num_global_parts){
2035 this->last_dim_num_part = this->num_global_parts;
2038 this->last_dim_num_part = p / this->max_num_part_along_dim;
2044 this->total_num_cut = this->total_num_part - 1;
2045 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2046 this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
2050 if(this->max_concurrent_part_calculation > this->last_dim_num_part){
2051 if(this->mj_problemComm->getRank() == 0){
2052 std::cerr <<
"Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
2053 ") has been set bigger than maximum amount that can be used." <<
2054 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2056 this->max_concurrent_part_calculation = this->last_dim_num_part;
2065 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2067 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2068 mj_part_t num_total_future,
2071 double fp = pow(num_total_future,
root);
2072 mj_part_t ip = mj_part_t (fp);
2073 if (fp - ip < this->fEpsilon * 100){
2095 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2097 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
2098 std::vector <mj_part_t> &num_partitioning_in_current_dim,
2099 std::vector<mj_part_t> *future_num_part_in_parts,
2100 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2101 mj_part_t &future_num_parts,
2102 mj_part_t current_num_parts,
2103 int current_iteration,
2104 RCP<mj_partBoxVector_t> input_part_boxes,
2105 RCP<mj_partBoxVector_t> output_part_boxes,
2106 mj_part_t atomic_part_count
2109 mj_part_t output_num_parts = 0;
2110 if(this->part_no_array){
2115 mj_part_t p = this->part_no_array[current_iteration];
2117 std::cout <<
"i:" << current_iteration <<
" p is given as:" << p << std::endl;
2121 return current_num_parts;
2124 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2125 num_partitioning_in_current_dim.push_back(p);
2138 future_num_parts /= num_partitioning_in_current_dim[0];
2139 output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2141 if (this->mj_keep_part_boxes){
2142 for (mj_part_t k = 0; k < current_num_parts; ++k){
2144 for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2145 output_part_boxes->push_back((*input_part_boxes)[k]);
2153 for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2154 next_future_num_parts_in_parts->push_back(future_num_parts);
2164 future_num_parts = 1;
2168 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2170 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2174 mj_part_t num_partitions_in_current_dim =
2175 this->get_part_count(
2176 future_num_parts_of_part_ii,
2177 1.0 / (this->recursion_depth - current_iteration)
2180 if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2181 std::cerr <<
"ERROR: maxPartNo calculation is wrong. num_partitions_in_current_dim: "
2182 << num_partitions_in_current_dim <<
"this->max_num_part_along_dim:"
2183 << this->max_num_part_along_dim <<
2184 " this->recursion_depth:" << this->recursion_depth <<
2185 " current_iteration:" << current_iteration <<
2186 " future_num_parts_of_part_ii:" << future_num_parts_of_part_ii <<
2187 " might need to fix max part no calculation for largest_prime_first partitioning" <<
2192 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2194 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2195 if (this->divide_to_prime_first){
2198 output_num_parts += num_partitions_in_current_dim;
2199 if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
2200 atomic_part_count = 1;
2203 largest_prime_factor = this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2208 if (largest_prime_factor < num_partitions_in_current_dim){
2209 largest_prime_factor = num_partitions_in_current_dim;
2213 mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2215 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2218 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2220 mj_part_t my_ideal_primescale = ideal_prime_scale;
2222 if (iii < (largest_prime_factor) % num_partitions_in_current_dim){
2223 ++my_ideal_primescale;
2226 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part * my_ideal_primescale;
2229 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor){
2231 ++num_future_parts_for_part_iii;
2234 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2237 if (this->mj_keep_part_boxes){
2238 output_part_boxes->push_back((*input_part_boxes)[ii]);
2242 if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2251 output_num_parts += num_partitions_in_current_dim;
2255 if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
2256 atomic_part_count = 1;
2259 mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / num_partitions_in_current_dim;
2260 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2261 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2264 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % num_partitions_in_current_dim){
2266 ++num_future_parts_for_part_iii;
2269 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2272 if (this->mj_keep_part_boxes){
2273 output_part_boxes->push_back((*input_part_boxes)[ii]);
2277 if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2282 return output_num_parts;
2289 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2291 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2294 this->owner_of_coordinate = NULL;
2299 this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2301 #ifdef HAVE_ZOLTAN2_OMP
2302 #pragma omp parallel for
2304 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2305 this->coordinate_permutations[i] = i;
2309 this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2311 this->assigned_part_ids = NULL;
2312 if(this->num_local_coords > 0){
2313 this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2319 this->part_xadj = allocMemory<mj_lno_t>(1);
2320 this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords);
2322 this->new_part_xadj = NULL;
2328 this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2330 this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2332 this->process_cut_line_weight_to_put_left = NULL;
2333 this->thread_cut_line_weight_to_put_left = NULL;
2335 if(this->distribute_points_on_cut_lines){
2336 this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2337 this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2338 for(
int i = 0; i < this->num_threads; ++i){
2339 this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2341 this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2342 this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2350 this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2351 this->max_concurrent_part_calculation);
2355 this->target_part_weights = allocMemory<mj_scalar_t>(
2356 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2359 this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2360 this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2361 this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2362 this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2364 this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2365 this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2369 this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2372 this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2374 this->thread_part_weights = allocMemory<double *>(this->num_threads);
2376 this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2379 this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2381 this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2384 this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2386 for(
int i = 0; i < this->num_threads; ++i){
2388 this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2389 this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2390 this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2391 this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2397 this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2398 this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2401 mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2402 for (
int i=0; i < this->coord_dim; i++){
2403 coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2404 #ifdef HAVE_ZOLTAN2_OMP
2405 #pragma omp parallel for
2407 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2408 coord[i][j] = this->mj_coordinates[i][j];
2410 this->mj_coordinates = coord;
2413 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2414 mj_scalar_t **
weights = allocMemory<mj_scalar_t *>(criteria_dim);
2416 for (
int i=0; i < criteria_dim; i++){
2419 for (
int i=0; i < this->num_weights_per_coord; i++){
2420 weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2421 #ifdef HAVE_ZOLTAN2_OMP
2422 #pragma omp parallel for
2424 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2425 weights[i][j] = this->mj_weights[i][j];
2429 this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2430 #ifdef HAVE_ZOLTAN2_OMP
2431 #pragma omp parallel for
2433 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2434 this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2436 this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2438 #ifdef HAVE_ZOLTAN2_OMP
2439 #pragma omp parallel for
2441 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2442 this->owner_of_coordinate[j] = this->myActualRank;
2447 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2449 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2452 mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2454 mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2456 mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2458 mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2460 for (
int i = 0; i < this->coord_dim; ++i){
2461 mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2462 mj_scalar_t localMax = -localMin;
2463 if (localMax > 0) localMax = 0;
2466 for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2467 if (this->mj_coordinates[i][j] < localMin){
2468 localMin = this->mj_coordinates[i][j];
2470 if (this->mj_coordinates[i][j] > localMax){
2471 localMax = this->mj_coordinates[i][j];
2480 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2481 this->coord_dim, mins, gmins
2485 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2486 this->coord_dim, maxs, gmaxs
2492 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2494 freeArray<mj_scalar_t>(mins);
2495 freeArray<mj_scalar_t>(gmins);
2496 freeArray<mj_scalar_t>(maxs);
2497 freeArray<mj_scalar_t>(gmaxs);
2505 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2507 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2508 RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2511 mj_partBox_t tmp_box(*global_box);
2512 initial_partitioning_boxes->push_back(tmp_box);
2525 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2527 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2528 mj_lno_t coordinate_begin_index,
2529 mj_lno_t coordinate_end_index,
2530 mj_lno_t *mj_current_coordinate_permutations,
2531 mj_scalar_t *mj_current_dim_coords,
2532 mj_scalar_t &min_coordinate,
2533 mj_scalar_t &max_coordinate,
2534 mj_scalar_t &total_weight){
2538 if(coordinate_begin_index >= coordinate_end_index)
2540 min_coordinate = this->maxScalar_t;
2541 max_coordinate = this->minScalar_t;
2545 mj_scalar_t my_total_weight = 0;
2546 #ifdef HAVE_ZOLTAN2_OMP
2547 #pragma omp parallel num_threads(this->num_threads)
2551 if (this->mj_uniform_weights[0]) {
2552 #ifdef HAVE_ZOLTAN2_OMP
2556 my_total_weight = coordinate_end_index - coordinate_begin_index;
2562 #ifdef HAVE_ZOLTAN2_OMP
2563 #pragma omp for reduction(+:my_total_weight)
2565 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2566 int i = mj_current_coordinate_permutations[ii];
2567 my_total_weight += this->mj_weights[0][i];
2571 int my_thread_id = 0;
2572 #ifdef HAVE_ZOLTAN2_OMP
2573 my_thread_id = omp_get_thread_num();
2575 mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2576 my_thread_min_coord=my_thread_max_coord
2577 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2580 #ifdef HAVE_ZOLTAN2_OMP
2583 for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2584 int i = mj_current_coordinate_permutations[j];
2585 if(mj_current_dim_coords[i] > my_thread_max_coord)
2586 my_thread_max_coord = mj_current_dim_coords[i];
2587 if(mj_current_dim_coords[i] < my_thread_min_coord)
2588 my_thread_min_coord = mj_current_dim_coords[i];
2590 this->max_min_coords[my_thread_id] = my_thread_min_coord;
2591 this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2593 #ifdef HAVE_ZOLTAN2_OMP
2596 #pragma omp single nowait
2599 min_coordinate = this->max_min_coords[0];
2600 for(
int i = 1; i < this->num_threads; ++i){
2601 if(this->max_min_coords[i] < min_coordinate)
2602 min_coordinate = this->max_min_coords[i];
2606 #ifdef HAVE_ZOLTAN2_OMP
2607 #pragma omp single nowait
2610 max_coordinate = this->max_min_coords[this->num_threads];
2611 for(
int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2612 if(this->max_min_coords[i] > max_coordinate)
2613 max_coordinate = this->max_min_coords[i];
2617 total_weight = my_total_weight;
2629 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2631 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2632 mj_part_t current_concurrent_num_parts,
2633 mj_scalar_t *local_min_max_total,
2634 mj_scalar_t *global_min_max_total){
2639 if(this->comm->getSize() > 1){
2642 current_concurrent_num_parts,
2643 current_concurrent_num_parts,
2644 current_concurrent_num_parts);
2646 reduceAll<int, mj_scalar_t>(
2649 3 * current_concurrent_num_parts,
2650 local_min_max_total,
2651 global_min_max_total);
2656 mj_part_t s = 3 * current_concurrent_num_parts;
2657 for (mj_part_t i = 0; i < s; ++i){
2658 global_min_max_total[i] = local_min_max_total[i];
2683 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2685 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2686 mj_scalar_t min_coord,
2687 mj_scalar_t max_coord,
2688 mj_part_t num_cuts ,
2689 mj_scalar_t global_weight,
2690 mj_scalar_t *initial_cut_coords ,
2691 mj_scalar_t *current_target_part_weights ,
2693 std::vector <mj_part_t> *future_num_part_in_parts,
2694 std::vector <mj_part_t> *next_future_num_parts_in_parts,
2695 mj_part_t concurrent_current_part,
2696 mj_part_t obtained_part_index
2699 mj_scalar_t coord_range = max_coord - min_coord;
2700 if(this->mj_uniform_parts[0]){
2702 mj_part_t cumulative = 0;
2704 mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2708 mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2714 for(mj_part_t i = 0; i < num_cuts; ++i){
2715 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2723 current_target_part_weights[i] = cumulative * unit_part_weight;
2727 initial_cut_coords[i] = min_coord + (coord_range * cumulative) / total_future_part_count_in_part;
2729 current_target_part_weights[num_cuts] = 1;
2733 if (this->mj_uniform_weights[0]){
2734 for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2736 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2741 std::cerr <<
"MJ does not support non uniform part weights" << std::endl;
2759 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2761 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
2762 mj_scalar_t &max_coordinate,
2763 mj_scalar_t &min_coordinate,
2764 mj_part_t &concurrent_current_part_index,
2765 mj_lno_t coordinate_begin_index,
2766 mj_lno_t coordinate_end_index,
2767 mj_lno_t *mj_current_coordinate_permutations,
2768 mj_scalar_t *mj_current_dim_coords,
2769 mj_part_t *mj_part_ids,
2770 mj_part_t &partition_count
2772 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2776 if(
ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2777 #ifdef HAVE_ZOLTAN2_OMP
2778 #pragma omp parallel for
2780 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2781 mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2788 mj_scalar_t slice = coordinate_range / partition_count;
2790 #ifdef HAVE_ZOLTAN2_OMP
2791 #pragma omp parallel for
2793 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2795 mj_lno_t iii = mj_current_coordinate_permutations[ii];
2796 mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2797 mj_part_ids[iii] = 2 * pp;
2813 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2815 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
2816 mj_scalar_t *mj_current_dim_coords,
2817 mj_scalar_t used_imbalance_tolerance,
2818 mj_part_t current_work_part,
2819 mj_part_t current_concurrent_num_parts,
2820 mj_scalar_t *current_cut_coordinates,
2821 mj_part_t total_incomplete_cut_count,
2822 std::vector <mj_part_t> &num_partitioning_in_current_dim
2826 mj_part_t rectilinear_cut_count = 0;
2827 mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2830 *reductionOp = NULL;
2832 <mj_part_t, mj_scalar_t>(
2833 &num_partitioning_in_current_dim ,
2835 current_concurrent_num_parts);
2837 size_t total_reduction_size = 0;
2838 #ifdef HAVE_ZOLTAN2_OMP
2839 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) num_threads(this->num_threads)
2843 #ifdef HAVE_ZOLTAN2_OMP
2844 me = omp_get_thread_num();
2846 double *my_thread_part_weights = this->thread_part_weights[me];
2847 mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2848 mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2850 #ifdef HAVE_ZOLTAN2_OMP
2856 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2858 mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2859 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2860 total_reduction_size += (4 * num_cut_in_dim + 1);
2862 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2863 this->is_cut_line_determined[next] =
false;
2864 this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i];
2865 this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts];
2867 this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts];
2868 this->cut_lower_bound_weights[next] = 0;
2870 if(this->distribute_points_on_cut_lines){
2871 this->process_cut_line_weight_to_put_left[next] = 0;
2882 while (total_incomplete_cut_count != 0){
2884 mj_part_t concurrent_cut_shifts = 0;
2885 size_t total_part_shift = 0;
2887 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2888 mj_part_t num_parts = -1;
2889 num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2891 mj_part_t num_cuts = num_parts - 1;
2892 size_t total_part_count = num_parts + size_t (num_cuts) ;
2893 if (this->my_incomplete_cut_count[kk] > 0){
2896 bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2897 double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2898 mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2899 mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2901 mj_part_t conccurent_current_part = current_work_part + kk;
2902 mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2903 mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2904 mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2906 mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2907 mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2910 this->mj_1D_part_get_thread_part_weights(
2915 coordinate_begin_index,
2916 coordinate_end_index,
2917 mj_current_dim_coords,
2918 temp_current_cut_coords,
2920 my_current_part_weights,
2921 my_current_left_closest,
2922 my_current_right_closest);
2926 concurrent_cut_shifts += num_cuts;
2927 total_part_shift += total_part_count;
2931 this->mj_accumulate_thread_results(
2932 num_partitioning_in_current_dim,
2934 current_concurrent_num_parts);
2937 #ifdef HAVE_ZOLTAN2_OMP
2941 if(this->comm->getSize() > 1){
2942 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2943 total_reduction_size,
2944 this->total_part_weight_left_right_closests,
2945 this->global_total_part_weight_left_right_closests);
2950 this->global_total_part_weight_left_right_closests,
2951 this->total_part_weight_left_right_closests,
2952 total_reduction_size *
sizeof(mj_scalar_t));
2957 mj_part_t cut_shift = 0;
2960 size_t tlr_shift = 0;
2961 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2962 mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2963 mj_part_t num_cuts = num_parts - 1;
2964 size_t num_total_part = num_parts + size_t (num_cuts) ;
2969 if (this->my_incomplete_cut_count[kk] == 0) {
2970 cut_shift += num_cuts;
2971 tlr_shift += (num_total_part + 2 * num_cuts);
2975 mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2976 mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2977 mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part;
2978 mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts;
2979 mj_scalar_t *current_global_part_weights = current_global_tlr;
2980 bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2982 mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2983 mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2985 mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2986 mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2987 mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2988 mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2989 mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2990 mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2991 mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2993 mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2996 this->mj_get_new_cut_coordinates(
3001 global_total_weight,
3002 used_imbalance_tolerance,
3003 current_global_part_weights,
3004 current_local_part_weights,
3005 current_part_target_weights,
3006 current_cut_line_determined,
3007 temp_cut_coords + cut_shift,
3008 current_cut_upper_bounds,
3009 current_cut_lower_bounds,
3010 current_global_left_closest_points,
3011 current_global_right_closest_points,
3012 current_cut_lower_bound_weights,
3013 current_cut_upper_weights,
3014 this->cut_coordinates_work_array +cut_shift,
3015 current_part_cut_line_weight_to_put_left,
3016 &rectilinear_cut_count,
3017 this->my_incomplete_cut_count[kk]);
3019 cut_shift += num_cuts;
3020 tlr_shift += (num_total_part + 2 * num_cuts);
3021 mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
3022 #ifdef HAVE_ZOLTAN2_OMP
3026 total_incomplete_cut_count -= iteration_complete_cut_count;
3031 #ifdef HAVE_ZOLTAN2_OMP
3037 mj_scalar_t *t = temp_cut_coords;
3038 temp_cut_coords = this->cut_coordinates_work_array;
3039 this->cut_coordinates_work_array = t;
3050 if (current_cut_coordinates != temp_cut_coords){
3051 #ifdef HAVE_ZOLTAN2_OMP
3056 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3057 mj_part_t num_parts = -1;
3058 num_parts = num_partitioning_in_current_dim[current_work_part + i];
3059 mj_part_t num_cuts = num_parts - 1;
3061 for(mj_part_t ii = 0; ii < num_cuts; ++ii){
3062 current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
3068 #ifdef HAVE_ZOLTAN2_OMP
3072 this->cut_coordinates_work_array = temp_cut_coords;
3099 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3101 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
3102 size_t total_part_count,
3104 mj_scalar_t max_coord,
3105 mj_scalar_t min_coord,
3106 mj_lno_t coordinate_begin_index,
3107 mj_lno_t coordinate_end_index,
3108 mj_scalar_t *mj_current_dim_coords,
3109 mj_scalar_t *temp_current_cut_coords,
3110 bool *current_cut_status,
3111 double *my_current_part_weights,
3112 mj_scalar_t *my_current_left_closest,
3113 mj_scalar_t *my_current_right_closest){
3116 for (
size_t i = 0; i < total_part_count; ++i){
3117 my_current_part_weights[i] = 0;
3122 for(mj_part_t i = 0; i < num_cuts; ++i){
3123 my_current_left_closest[i] = min_coord - 1;
3124 my_current_right_closest[i] = max_coord + 1;
3127 mj_scalar_t minus_EPSILON = -this->sEpsilon;
3128 #ifdef HAVE_ZOLTAN2_OMP
3134 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3135 int i = this->coordinate_permutations[ii];
3139 mj_part_t j = this->assigned_part_ids[i] / 2;
3145 mj_part_t lower_cut_index = 0;
3146 mj_part_t upper_cut_index = num_cuts - 1;
3148 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3149 bool is_inserted =
false;
3150 bool is_on_left_of_cut =
false;
3151 bool is_on_right_of_cut =
false;
3152 mj_part_t last_compared_part = -1;
3154 mj_scalar_t coord = mj_current_dim_coords[i];
3156 while(upper_cut_index >= lower_cut_index)
3159 last_compared_part = -1;
3160 is_on_left_of_cut =
false;
3161 is_on_right_of_cut =
false;
3162 mj_scalar_t cut = temp_current_cut_coords[j];
3163 mj_scalar_t distance_to_cut = coord - cut;
3164 mj_scalar_t abs_distance_to_cut =
ZOLTAN2_ABS(distance_to_cut);
3167 if(abs_distance_to_cut < this->sEpsilon){
3169 my_current_part_weights[j * 2 + 1] += w;
3170 this->assigned_part_ids[i] = j * 2 + 1;
3173 my_current_left_closest[j] = coord;
3174 my_current_right_closest[j] = coord;
3177 mj_part_t kk = j + 1;
3178 while(kk < num_cuts){
3180 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3181 if(distance_to_cut < this->sEpsilon){
3182 my_current_part_weights[2 * kk + 1] += w;
3183 my_current_left_closest[kk] = coord;
3184 my_current_right_closest[kk] = coord;
3190 if(coord - my_current_left_closest[kk] > this->sEpsilon){
3191 my_current_left_closest[kk] = coord;
3201 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3202 if(distance_to_cut < this->sEpsilon){
3203 my_current_part_weights[2 * kk + 1] += w;
3205 this->assigned_part_ids[i] = kk * 2 + 1;
3206 my_current_left_closest[kk] = coord;
3207 my_current_right_closest[kk] = coord;
3213 if(my_current_right_closest[kk] - coord > this->sEpsilon){
3214 my_current_right_closest[kk] = coord;
3225 if (distance_to_cut < 0) {
3226 bool _break =
false;
3230 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3231 if(distance_to_next_cut > this->sEpsilon){
3237 upper_cut_index = j - 1;
3239 is_on_left_of_cut =
true;
3240 last_compared_part = j;
3245 bool _break =
false;
3246 if(j < num_cuts - 1){
3249 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3250 if(distance_to_next_cut < minus_EPSILON){
3257 lower_cut_index = j + 1;
3259 is_on_right_of_cut =
true;
3260 last_compared_part = j;
3265 j = (upper_cut_index + lower_cut_index) / 2;
3268 if(is_on_right_of_cut){
3271 my_current_part_weights[2 * last_compared_part + 2] += w;
3272 this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3275 if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3276 my_current_right_closest[last_compared_part] = coord;
3279 if(last_compared_part+1 < num_cuts){
3281 if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3282 my_current_left_closest[last_compared_part + 1] = coord;
3287 else if(is_on_left_of_cut){
3290 my_current_part_weights[2 * last_compared_part] += w;
3291 this->assigned_part_ids[i] = 2 * last_compared_part;
3295 if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3296 my_current_left_closest[last_compared_part] = coord;
3300 if(last_compared_part-1 >= 0){
3301 if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3302 my_current_right_closest[last_compared_part -1] = coord;
3311 for (
size_t i = 1; i < total_part_count; ++i){
3315 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3316 ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3321 my_current_part_weights[i] = my_current_part_weights[i-2];
3325 my_current_part_weights[i] += my_current_part_weights[i-1];
3337 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3339 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3340 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3341 mj_part_t current_work_part,
3342 mj_part_t current_concurrent_num_parts){
3344 #ifdef HAVE_ZOLTAN2_OMP
3351 size_t tlr_array_shift = 0;
3352 mj_part_t cut_shift = 0;
3355 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3357 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3358 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3359 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3362 for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3363 mj_part_t next = tlr_array_shift + ii;
3364 mj_part_t cut_index = cut_shift + ii;
3365 if(this->is_cut_line_determined[cut_index])
continue;
3366 mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3367 right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3370 for (
int j = 1; j < this->num_threads; ++j){
3371 if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3372 right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3374 if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3375 left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3379 this->total_part_weight_left_right_closests[num_total_part_in_part +
3380 next] = left_closest_in_process;
3381 this->total_part_weight_left_right_closests[num_total_part_in_part +
3382 num_cuts_in_part + next] = right_closest_in_process;
3385 tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3386 cut_shift += num_cuts_in_part;
3389 tlr_array_shift = 0;
3391 size_t total_part_array_shift = 0;
3394 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3396 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3397 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3398 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3400 for(
size_t j = 0; j < num_total_part_in_part; ++j){
3402 mj_part_t cut_ind = j / 2 + cut_shift;
3407 if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind])
continue;
3409 for (
int k = 0; k < this->num_threads; ++k){
3410 pwj += this->thread_part_weights[k][total_part_array_shift + j];
3413 this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3415 cut_shift += num_cuts_in_part;
3416 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3417 total_part_array_shift += num_total_part_in_part;
3435 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3437 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3438 mj_scalar_t cut_upper_bound,
3439 mj_scalar_t cut_lower_bound,
3440 mj_scalar_t cut_upper_weight,
3441 mj_scalar_t cut_lower_weight,
3442 mj_scalar_t expected_weight,
3443 mj_scalar_t &new_cut_position){
3445 if(
ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3446 new_cut_position = cut_upper_bound;
3450 if(
ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3451 new_cut_position = cut_lower_bound;
3454 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3455 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3456 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3458 mj_scalar_t required_shift = (my_weight_diff / weight_range);
3459 int scale_constant = 20;
3460 int shiftint= int (required_shift * scale_constant);
3461 if (shiftint == 0) shiftint = 1;
3462 required_shift = mj_scalar_t (shiftint) / scale_constant;
3463 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3477 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3479 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3480 mj_part_t num_parts,
3481 mj_scalar_t *mj_current_dim_coords,
3482 mj_scalar_t *current_concurrent_cut_coordinate,
3483 mj_lno_t coordinate_begin,
3484 mj_lno_t coordinate_end,
3485 mj_scalar_t *used_local_cut_line_weight_to_left,
3486 double **used_thread_part_weight_work,
3487 mj_lno_t *out_part_xadj){
3489 mj_part_t num_cuts = num_parts - 1;
3491 #ifdef HAVE_ZOLTAN2_OMP
3492 #pragma omp parallel
3496 #ifdef HAVE_ZOLTAN2_OMP
3497 me = omp_get_thread_num();
3500 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3501 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3505 if (this->distribute_points_on_cut_lines){
3506 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3508 #ifdef HAVE_ZOLTAN2_OMP
3511 for (mj_part_t i = 0; i < num_cuts; ++i){
3513 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3514 for(
int ii = 0; ii < this->num_threads; ++ii){
3515 if(left_weight > this->sEpsilon){
3517 mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3518 if(thread_ii_weight_on_cut < left_weight){
3520 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3524 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3526 left_weight -= thread_ii_weight_on_cut;
3529 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3537 for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3538 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3539 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3547 for(mj_part_t ii = 0; ii < num_parts; ++ii){
3548 thread_num_points_in_parts[ii] = 0;
3552 #ifdef HAVE_ZOLTAN2_OMP
3556 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3558 mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3559 mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3560 mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3561 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3562 if(coordinate_assigned_place % 2 == 1){
3564 if(this->distribute_points_on_cut_lines
3565 && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3569 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3574 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3575 && coordinate_assigned_part < num_cuts - 1
3576 &&
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3577 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3578 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3580 ++thread_num_points_in_parts[coordinate_assigned_part];
3581 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3585 ++coordinate_assigned_part;
3587 while(this->distribute_points_on_cut_lines &&
3588 coordinate_assigned_part < num_cuts){
3590 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3591 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3594 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3596 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3597 ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3598 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3601 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3602 coordinate_assigned_part < num_cuts - 1 &&
3603 ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3604 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3605 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3613 ++coordinate_assigned_part;
3615 ++thread_num_points_in_parts[coordinate_assigned_part];
3616 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3621 ++thread_num_points_in_parts[coordinate_assigned_part];
3622 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3631 #ifdef HAVE_ZOLTAN2_OMP
3634 for(mj_part_t j = 0; j < num_parts; ++j){
3635 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3636 for (
int i = 0; i < this->num_threads; ++i){
3637 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3639 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3640 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3643 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3647 #ifdef HAVE_ZOLTAN2_OMP
3652 for(mj_part_t j = 1; j < num_parts; ++j){
3653 out_part_xadj[j] += out_part_xadj[j - 1];
3659 for(mj_part_t j = 1; j < num_parts; ++j){
3660 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3666 #ifdef HAVE_ZOLTAN2_OMP
3669 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3670 mj_lno_t i = this->coordinate_permutations[ii];
3671 mj_part_t p = this->assigned_part_ids[i];
3672 this->new_coordinate_permutations[coordinate_begin +
3673 thread_num_points_in_parts[p]++] = i;
3708 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3710 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3711 const size_t &num_total_part,
3712 const mj_part_t &num_cuts,
3713 const mj_scalar_t &max_coordinate,
3714 const mj_scalar_t &min_coordinate,
3715 const mj_scalar_t &global_total_weight,
3716 const mj_scalar_t &used_imbalance_tolerance,
3717 mj_scalar_t * current_global_part_weights,
3718 const mj_scalar_t * current_local_part_weights,
3719 const mj_scalar_t *current_part_target_weights,
3720 bool *current_cut_line_determined,
3721 mj_scalar_t *current_cut_coordinates,
3722 mj_scalar_t *current_cut_upper_bounds,
3723 mj_scalar_t *current_cut_lower_bounds,
3724 mj_scalar_t *current_global_left_closest_points,
3725 mj_scalar_t *current_global_right_closest_points,
3726 mj_scalar_t * current_cut_lower_bound_weights,
3727 mj_scalar_t * current_cut_upper_weights,
3728 mj_scalar_t *new_current_cut_coordinates,
3729 mj_scalar_t *current_part_cut_line_weight_to_put_left,
3730 mj_part_t *rectilinear_cut_count,
3731 mj_part_t &my_num_incomplete_cut){
3734 mj_scalar_t seen_weight_in_part = 0;
3736 mj_scalar_t expected_weight_in_part = 0;
3738 mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;
3741 #ifdef HAVE_ZOLTAN2_OMP
3744 for (mj_part_t i = 0; i < num_cuts; i++){
3747 if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3748 current_global_left_closest_points[i] = current_cut_coordinates[i];
3749 if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3750 current_global_right_closest_points[i] = current_cut_coordinates[i];
3753 #ifdef HAVE_ZOLTAN2_OMP
3756 for (mj_part_t i = 0; i < num_cuts; i++){
3758 if(this->distribute_points_on_cut_lines){
3760 this->global_rectilinear_cut_weight[i] = 0;
3761 this->process_rectilinear_cut_weight[i] = 0;
3765 if(current_cut_line_determined[i]) {
3766 new_current_cut_coordinates[i] = current_cut_coordinates[i];
3771 seen_weight_in_part = current_global_part_weights[i * 2];
3780 expected_weight_in_part = current_part_target_weights[i];
3782 imbalance_on_left =
imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3784 imbalance_on_right =
imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3786 bool is_left_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3787 bool is_right_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3790 if(is_left_imbalance_valid && is_right_imbalance_valid){
3791 current_cut_line_determined[i] =
true;
3792 #ifdef HAVE_ZOLTAN2_OMP
3795 my_num_incomplete_cut -= 1;
3796 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3799 else if(imbalance_on_left < 0){
3802 if(this->distribute_points_on_cut_lines){
3807 if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3809 current_cut_line_determined[i] =
true;
3810 #ifdef HAVE_ZOLTAN2_OMP
3813 my_num_incomplete_cut -= 1;
3816 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3820 current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3823 else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3827 current_cut_line_determined[i] =
true;
3828 #ifdef HAVE_ZOLTAN2_OMP
3831 *rectilinear_cut_count += 1;
3834 #ifdef HAVE_ZOLTAN2_OMP
3837 my_num_incomplete_cut -= 1;
3838 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3839 this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3840 current_local_part_weights[i * 2];
3845 current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3847 current_cut_lower_bound_weights[i] = seen_weight_in_part;
3851 for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3852 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3853 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3855 if(p_weight >= expected_weight_in_part){
3860 if(p_weight == expected_weight_in_part){
3861 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3862 current_cut_upper_weights[i] = p_weight;
3863 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3864 current_cut_lower_bound_weights[i] = p_weight;
3865 }
else if (p_weight < current_cut_upper_weights[i]){
3868 current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3869 current_cut_upper_weights[i] = p_weight;
3875 if(line_weight >= expected_weight_in_part){
3878 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3879 current_cut_upper_weights[i] = line_weight;
3880 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3881 current_cut_lower_bound_weights[i] = p_weight;
3886 if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3887 current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3888 current_cut_lower_bound_weights[i] = p_weight;
3893 mj_scalar_t new_cut_position = 0;
3894 this->mj_calculate_new_cut_position(
3895 current_cut_upper_bounds[i],
3896 current_cut_lower_bounds[i],
3897 current_cut_upper_weights[i],
3898 current_cut_lower_bound_weights[i],
3899 expected_weight_in_part, new_cut_position);
3903 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3906 current_cut_line_determined[i] =
true;
3907 #ifdef HAVE_ZOLTAN2_OMP
3910 my_num_incomplete_cut -= 1;
3913 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3915 new_current_cut_coordinates [i] = new_cut_position;
3921 current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3922 current_cut_upper_weights[i] = seen_weight_in_part;
3925 for (
int ii = i - 1; ii >= 0; --ii){
3926 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3927 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3928 if(p_weight <= expected_weight_in_part){
3929 if(p_weight == expected_weight_in_part){
3932 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3933 current_cut_upper_weights[i] = p_weight;
3934 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3935 current_cut_lower_bound_weights[i] = p_weight;
3937 else if (p_weight > current_cut_lower_bound_weights[i]){
3940 current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3941 current_cut_lower_bound_weights[i] = p_weight;
3947 if(line_weight > expected_weight_in_part){
3948 current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3949 current_cut_upper_weights[i] = line_weight;
3958 if (p_weight >= expected_weight_in_part &&
3959 (p_weight < current_cut_upper_weights[i] ||
3960 (p_weight == current_cut_upper_weights[i] &&
3961 current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3965 current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3966 current_cut_upper_weights[i] = p_weight;
3969 mj_scalar_t new_cut_position = 0;
3970 this->mj_calculate_new_cut_position(
3971 current_cut_upper_bounds[i],
3972 current_cut_lower_bounds[i],
3973 current_cut_upper_weights[i],
3974 current_cut_lower_bound_weights[i],
3975 expected_weight_in_part,
3979 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3981 current_cut_line_determined[i] =
true;
3982 #ifdef HAVE_ZOLTAN2_OMP
3985 my_num_incomplete_cut -= 1;
3987 new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3989 new_current_cut_coordinates [ i] = new_cut_position;
3998 #ifdef HAVE_ZOLTAN2_OMP
4003 if(*rectilinear_cut_count > 0){
4006 Teuchos::scan<int,mj_scalar_t>(
4007 *comm, Teuchos::REDUCE_SUM,
4009 this->process_rectilinear_cut_weight,
4010 this->global_rectilinear_cut_weight
4015 for (mj_part_t i = 0; i < num_cuts; ++i){
4017 if(this->global_rectilinear_cut_weight[i] > 0) {
4019 mj_scalar_t expected_part_weight = current_part_target_weights[i];
4021 mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
4023 mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
4025 mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
4028 mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
4030 mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
4040 if(space_left_to_me < 0){
4042 current_part_cut_line_weight_to_put_left[i] = 0;
4044 else if(space_left_to_me >= my_weight_on_line){
4047 current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
4052 current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
4059 *rectilinear_cut_count = 0;
4074 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4076 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
4077 mj_part_t num_procs,
4078 mj_part_t num_parts,
4079 mj_gno_t *&num_points_in_all_processor_parts){
4082 size_t allocation_size = num_parts * (num_procs + 1);
4089 mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
4094 mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
4097 mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
4100 memset(num_local_points_in_each_part_to_reduce_sum, 0,
sizeof(mj_gno_t)*allocation_size);
4103 for (mj_part_t i = 0; i < num_parts; ++i){
4104 mj_lno_t part_begin_index = 0;
4106 part_begin_index = this->new_part_xadj[i - 1];
4108 mj_lno_t part_end_index = this->new_part_xadj[i];
4109 my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
4114 memcpy (my_local_point_counts_in_each_art,
4115 my_local_points_to_reduce_sum,
4116 sizeof(mj_gno_t) * (num_parts) );
4124 reduceAll<int, mj_gno_t>(
4126 Teuchos::REDUCE_SUM,
4128 num_local_points_in_each_part_to_reduce_sum,
4129 num_points_in_all_processor_parts);
4132 freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4149 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4151 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4152 size_t migration_reduce_all_population,
4153 mj_lno_t num_coords_for_last_dim_part,
4154 mj_part_t num_procs,
4155 mj_part_t num_parts,
4156 mj_gno_t *num_points_in_all_processor_parts){
4164 if (this->check_migrate_avoid_migration_option == 0){
4165 double global_imbalance = 0;
4167 size_t global_shift = num_procs * num_parts;
4169 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4170 for (mj_part_t i = 0; i < num_parts; ++i){
4171 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4172 / double(num_procs);
4175 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4178 global_imbalance /= num_parts;
4179 global_imbalance /= num_procs;
4187 if(global_imbalance <= this->minimum_migration_imbalance){
4210 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4212 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4213 mj_part_t num_parts,
4214 mj_part_t *part_assignment_proc_begin_indices,
4215 mj_part_t *processor_chains_in_parts,
4216 mj_lno_t *send_count_to_each_proc,
4217 int *coordinate_destinations){
4219 for (mj_part_t p = 0; p < num_parts; ++p){
4220 mj_lno_t part_begin = 0;
4221 if (p > 0) part_begin = this->new_part_xadj[p - 1];
4222 mj_lno_t part_end = this->new_part_xadj[p];
4225 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4227 mj_lno_t num_total_send = 0;
4228 for (mj_lno_t j=part_begin; j < part_end; j++){
4229 mj_lno_t local_ind = this->new_coordinate_permutations[j];
4230 while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4234 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4236 processor_chains_in_parts[proc_to_sent] = -1;
4238 proc_to_sent = part_assignment_proc_begin_indices[p];
4241 coordinate_destinations[local_ind] = proc_to_sent;
4261 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4263 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4264 mj_gno_t * num_points_in_all_processor_parts,
4265 mj_part_t num_parts,
4266 mj_part_t num_procs,
4267 mj_lno_t *send_count_to_each_proc,
4268 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4269 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4270 mj_part_t &out_part_index,
4271 mj_part_t &output_part_numbering_begin_index,
4272 int *coordinate_destinations){
4275 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4276 mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4279 bool did_i_find_my_group =
false;
4281 mj_part_t num_free_procs = num_procs;
4282 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4284 double max_imbalance_difference = 0;
4285 mj_part_t max_differing_part = 0;
4288 for (mj_part_t i=0; i < num_parts; i++){
4291 double scalar_required_proc = num_procs *
4292 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4295 mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc);
4299 if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4300 required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4304 num_free_procs -= required_proc;
4306 --minimum_num_procs_required_for_rest_of_parts;
4309 num_procs_assigned_to_each_part[i] = required_proc;
4314 double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4315 if (imbalance_wrt_ideal > max_imbalance_difference){
4316 max_imbalance_difference = imbalance_wrt_ideal;
4317 max_differing_part = i;
4322 if (num_free_procs > 0){
4323 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4330 mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4332 mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4333 mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4342 for (
int i = 0; i < num_procs; ++i ){
4343 processor_part_assignments[i] = -1;
4344 processor_chains_in_parts[i] = -1;
4346 for (
int i = 0; i < num_parts; ++i ){
4347 part_assignment_proc_begin_indices[i] = -1;
4353 uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4354 for(mj_part_t i = 0; i < num_parts; ++i){
4358 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4359 sort_item_num_part_points_in_procs[ii].id = ii;
4362 if (processor_part_assignments[ii] == -1){
4363 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4364 sort_item_num_part_points_in_procs[ii].signbit = 1;
4374 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4375 sort_item_num_part_points_in_procs[ii].signbit = 0;
4379 uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4389 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4390 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4391 mj_gno_t ideal_num_points_in_a_proc =
4392 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part /
double (required_proc_count)));
4395 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4396 mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4397 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4401 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4402 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4404 processor_part_assignments[proc_id] = i;
4407 bool did_change_sign =
false;
4409 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4412 if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4413 did_change_sign =
true;
4414 sort_item_num_part_points_in_procs[ii].signbit = 1;
4420 if(did_change_sign){
4422 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4434 if (!did_i_find_my_group){
4435 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4437 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4439 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4441 if(proc_id_to_assign == this->myRank){
4443 did_i_find_my_group =
true;
4445 part_assignment_proc_begin_indices[i] = this->myRank;
4446 processor_chains_in_parts[this->myRank] = -1;
4449 send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4452 for (mj_part_t in = 0; in < i; ++in){
4453 output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4460 if (!did_i_find_my_group){
4461 processor_ranks_for_subcomm.clear();
4469 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4470 mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4471 mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4476 if (num_points_to_sent < 0) {
4477 std::cout <<
"Migration - processor assignments - for part:" << i <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:" << num_points_to_sent << std::endl;
4482 switch (migration_type){
4486 while (num_points_to_sent > 0){
4488 if (num_points_to_sent <= space_left_in_sent_proc){
4490 space_left_in_sent_proc -= num_points_to_sent;
4492 if (this->myRank == nonassigned_proc_id){
4494 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4497 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4498 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4499 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4501 num_points_to_sent = 0;
4505 if(space_left_in_sent_proc > 0){
4506 num_points_to_sent -= space_left_in_sent_proc;
4509 if (this->myRank == nonassigned_proc_id){
4511 send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4512 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4513 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4514 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4519 ++next_proc_to_send_index;
4522 if(next_part_to_send_index < nprocs - required_proc_count ){
4523 std::cout <<
"Migration - processor assignments - for part:"
4525 <<
" next_part_to_send :" << next_part_to_send_index
4526 <<
" nprocs:" << nprocs
4527 <<
" required_proc_count:" << required_proc_count
4528 <<
" Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4534 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4536 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4545 if (this->myRank == nonassigned_proc_id){
4547 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4550 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4551 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4552 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4554 num_points_to_sent = 0;
4555 ++next_proc_to_send_index;
4558 if (next_proc_to_send_index == num_procs){
4559 next_proc_to_send_index = num_procs - required_proc_count;
4562 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4564 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4577 this->assign_send_destinations(
4579 part_assignment_proc_begin_indices,
4580 processor_chains_in_parts,
4581 send_count_to_each_proc,
4582 coordinate_destinations);
4584 freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4585 freeArray<mj_part_t>(processor_chains_in_parts);
4586 freeArray<mj_part_t>(processor_part_assignments);
4587 freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4588 freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4605 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4607 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4608 mj_part_t num_parts,
4609 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
4610 int *coordinate_destinations,
4611 mj_part_t &output_part_numbering_begin_index,
4612 std::vector<mj_part_t> *next_future_num_parts_in_parts){
4614 mj_part_t part_shift_amount = output_part_numbering_begin_index;
4615 mj_part_t previous_processor = -1;
4616 for(mj_part_t i = 0; i < num_parts; ++i){
4617 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4619 mj_lno_t part_begin_index = 0;
4620 if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4621 mj_lno_t part_end_index = this->new_part_xadj[p];
4623 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4624 if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4625 output_part_numbering_begin_index = part_shift_amount;
4627 previous_processor = assigned_proc;
4628 part_shift_amount += (*next_future_num_parts_in_parts)[p];
4630 for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4631 mj_lno_t localInd = this->new_coordinate_permutations[j];
4632 coordinate_destinations[localInd] = assigned_proc;
4654 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4656 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4657 mj_gno_t * num_points_in_all_processor_parts,
4658 mj_part_t num_parts,
4659 mj_part_t num_procs,
4660 mj_lno_t *send_count_to_each_proc,
4661 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4662 mj_part_t &out_num_part,
4663 std::vector<mj_part_t> &out_part_indices,
4664 mj_part_t &output_part_numbering_begin_index,
4665 int *coordinate_destinations){
4668 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4669 out_part_indices.clear();
4673 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4674 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4678 mj_lno_t work_each = mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
4680 mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4682 for (mj_part_t i = 0; i < num_procs; ++i){
4683 space_in_each_processor[i] = work_each;
4690 mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4691 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
4692 int empty_proc_count = num_procs;
4696 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4700 for (mj_part_t i = 0; i < num_parts; ++i){
4701 sort_item_point_counts_in_parts[i].id = i;
4702 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4705 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4711 for (mj_part_t j = 0; j < num_parts; ++j){
4713 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4715 mj_gno_t load = global_num_points_in_parts[i];
4718 mj_part_t assigned_proc = -1;
4720 mj_part_t best_proc_to_assign = 0;
4724 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4725 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4730 if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4732 sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4735 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4738 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4741 for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4742 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4743 mj_lno_t left_space = space_in_each_processor[ii] - load;
4745 if(left_space >= 0 ){
4750 if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4751 best_proc_to_assign = ii;
4756 if (assigned_proc == -1){
4757 assigned_proc = best_proc_to_assign;
4760 if (num_parts_proc_assigned[assigned_proc]++ == 0){
4763 space_in_each_processor[assigned_proc] -= load;
4765 sort_item_part_to_proc_assignment[j].id = i;
4766 sort_item_part_to_proc_assignment[j].val = assigned_proc;
4770 if (assigned_proc == this->myRank){
4772 out_part_indices.push_back(i);
4776 send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4778 freeArray<mj_part_t>(num_parts_proc_assigned);
4779 freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4780 freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4781 freeArray<mj_lno_t >(space_in_each_processor);
4785 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4789 this->assign_send_destinations2(
4791 sort_item_part_to_proc_assignment,
4792 coordinate_destinations,
4793 output_part_numbering_begin_index,
4794 next_future_num_parts_in_parts);
4796 freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4817 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4819 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
4820 mj_gno_t * num_points_in_all_processor_parts,
4821 mj_part_t num_parts,
4822 mj_part_t num_procs,
4823 mj_lno_t *send_count_to_each_proc,
4824 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4825 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4826 mj_part_t &out_num_part,
4827 std::vector<mj_part_t> &out_part_indices,
4828 mj_part_t &output_part_numbering_begin_index,
4829 int *coordinate_destinations){
4833 processor_ranks_for_subcomm.clear();
4835 if (num_procs > num_parts){
4840 mj_part_t out_part_index = 0;
4841 this->mj_assign_proc_to_parts(
4842 num_points_in_all_processor_parts,
4845 send_count_to_each_proc,
4846 processor_ranks_for_subcomm,
4847 next_future_num_parts_in_parts,
4849 output_part_numbering_begin_index,
4850 coordinate_destinations
4854 out_part_indices.clear();
4855 out_part_indices.push_back(out_part_index);
4862 processor_ranks_for_subcomm.push_back(this->myRank);
4866 this->mj_assign_parts_to_procs(
4867 num_points_in_all_processor_parts,
4870 send_count_to_each_proc,
4871 next_future_num_parts_in_parts,
4874 output_part_numbering_begin_index,
4875 coordinate_destinations);
4891 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4893 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
4894 mj_part_t num_procs,
4895 mj_lno_t &num_new_local_points,
4896 std::string iteration,
4897 int *coordinate_destinations,
4898 mj_part_t num_parts)
4900 #ifdef ENABLE_ZOLTAN_MIGRATION
4901 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
4907 ZOLTAN_COMM_OBJ *plan = NULL;
4908 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
4909 int num_incoming_gnos = 0;
4910 int message_tag = 7859;
4912 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4913 int ierr = Zoltan_Comm_Create(
4915 int(this->num_local_coords),
4916 coordinate_destinations,
4919 &num_incoming_gnos);
4921 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4923 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
4924 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4928 ierr = Zoltan_Comm_Do(
4931 (
char *) this->current_mj_gnos,
4933 (
char *) incoming_gnos);
4936 freeArray<mj_gno_t>(this->current_mj_gnos);
4937 this->current_mj_gnos = incoming_gnos;
4941 for (
int i = 0; i < this->coord_dim; ++i){
4943 mj_scalar_t *coord = this->mj_coordinates[i];
4945 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4946 ierr = Zoltan_Comm_Do(
4950 sizeof(mj_scalar_t),
4951 (
char *) this->mj_coordinates[i]);
4953 freeArray<mj_scalar_t>(coord);
4957 for (
int i = 0; i < this->num_weights_per_coord; ++i){
4959 mj_scalar_t *weight = this->mj_weights[i];
4961 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4962 ierr = Zoltan_Comm_Do(
4966 sizeof(mj_scalar_t),
4967 (
char *) this->mj_weights[i]);
4969 freeArray<mj_scalar_t>(weight);
4974 int *coord_own = allocMemory<int>(num_incoming_gnos);
4976 ierr = Zoltan_Comm_Do(
4979 (
char *) this->owner_of_coordinate,
4980 sizeof(
int), (
char *) coord_own);
4982 freeArray<int>(this->owner_of_coordinate);
4983 this->owner_of_coordinate = coord_own;
4989 mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4990 if(num_procs < num_parts){
4992 ierr = Zoltan_Comm_Do(
4995 (
char *) this->assigned_part_ids,
4997 (
char *) new_parts);
5000 freeArray<mj_part_t>(this->assigned_part_ids);
5001 this->assigned_part_ids = new_parts;
5003 ierr = Zoltan_Comm_Destroy(&plan);
5005 num_new_local_points = num_incoming_gnos;
5006 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
5011 #endif // ENABLE_ZOLTAN_MIGRATION
5013 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5014 Tpetra::Distributor distributor(this->comm);
5015 ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
5016 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
5017 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5019 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5022 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
5023 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5024 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5025 freeArray<mj_gno_t>(this->current_mj_gnos);
5026 this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
5028 this->current_mj_gnos,
5029 received_gnos.getRawPtr(),
5030 num_incoming_gnos *
sizeof(mj_gno_t));
5033 for (
int i = 0; i < this->coord_dim; ++i){
5035 ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
5036 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
5037 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
5038 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5039 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5041 this->mj_coordinates[i],
5042 received_coord.getRawPtr(),
5043 num_incoming_gnos *
sizeof(mj_scalar_t));
5047 for (
int i = 0; i < this->num_weights_per_coord; ++i){
5049 ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
5050 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
5051 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
5052 freeArray<mj_scalar_t>(this->mj_weights[i]);
5053 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5055 this->mj_weights[i],
5056 received_weight.getRawPtr(),
5057 num_incoming_gnos *
sizeof(mj_scalar_t));
5062 ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
5063 ArrayRCP<int> received_owners(num_incoming_gnos);
5064 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
5065 freeArray<int>(this->owner_of_coordinate);
5066 this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
5068 this->owner_of_coordinate,
5069 received_owners.getRawPtr(),
5070 num_incoming_gnos *
sizeof(int));
5076 if(num_procs < num_parts){
5077 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5078 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
5079 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5080 freeArray<mj_part_t>(this->assigned_part_ids);
5081 this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
5083 this->assigned_part_ids,
5084 received_partids.getRawPtr(),
5085 num_incoming_gnos *
sizeof(mj_part_t));
5088 mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
5089 freeArray<mj_part_t>(this->assigned_part_ids);
5090 this->assigned_part_ids = new_parts;
5092 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5093 num_new_local_points = num_incoming_gnos;
5104 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5106 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
5107 mj_part_t group_size = processor_ranks_for_subcomm.size();
5108 mj_part_t *ids = allocMemory<mj_part_t>(group_size);
5109 for(mj_part_t i = 0; i < group_size; ++i) {
5110 ids[i] = processor_ranks_for_subcomm[i];
5112 ArrayView<const mj_part_t> idView(ids, group_size);
5113 this->comm = this->comm->createSubcommunicator(idView);
5114 freeArray<mj_part_t>(ids);
5123 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5125 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
5126 mj_part_t output_num_parts,
5127 mj_part_t num_parts){
5129 if (output_num_parts == 1){
5130 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5131 this->new_coordinate_permutations[i] = i;
5133 this->new_part_xadj[0] = this->num_local_coords;
5140 mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
5142 mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
5144 memset(num_points_in_parts, 0,
sizeof(mj_lno_t) * num_parts);
5146 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5147 mj_part_t ii = this->assigned_part_ids[i];
5148 ++num_points_in_parts[ii];
5153 mj_lno_t prev_index = 0;
5154 for(mj_part_t i = 0; i < num_parts; ++i){
5155 if(num_points_in_parts[i] > 0) {
5156 this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
5157 prev_index += num_points_in_parts[i];
5158 part_shifts[i] = p++;
5163 mj_part_t assigned_num_parts = p - 1;
5164 for (;p < num_parts; ++p){
5165 this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
5167 for(mj_part_t i = 0; i < output_num_parts; ++i){
5168 num_points_in_parts[i] = this->new_part_xadj[i];
5174 for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5175 mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5176 this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5179 freeArray<mj_lno_t>(num_points_in_parts);
5180 freeArray<mj_part_t>(part_shifts);
5207 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5209 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
5210 mj_part_t input_num_parts,
5211 mj_part_t &output_num_parts,
5212 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5213 mj_part_t &output_part_begin_index,
5214 size_t migration_reduce_all_population,
5215 mj_lno_t num_coords_for_last_dim_part,
5216 std::string iteration,
5217 RCP<mj_partBoxVector_t> &input_part_boxes,
5218 RCP<mj_partBoxVector_t> &output_part_boxes
5221 mj_part_t num_procs = this->comm->getSize();
5222 this->myRank = this->comm->getRank();
5228 mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5231 this->get_processor_num_points_in_parts(
5234 num_points_in_all_processor_parts);
5238 if (!this->mj_check_to_migrate(
5239 migration_reduce_all_population,
5240 num_coords_for_last_dim_part,
5243 num_points_in_all_processor_parts)){
5244 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5249 mj_lno_t *send_count_to_each_proc = NULL;
5250 int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5251 send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5252 for (
int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5254 std::vector<mj_part_t> processor_ranks_for_subcomm;
5255 std::vector<mj_part_t> out_part_indices;
5258 this->mj_migration_part_proc_assignment(
5259 num_points_in_all_processor_parts,
5262 send_count_to_each_proc,
5263 processor_ranks_for_subcomm,
5264 next_future_num_parts_in_parts,
5267 output_part_begin_index,
5268 coordinate_destinations);
5273 freeArray<mj_lno_t>(send_count_to_each_proc);
5274 std::vector <mj_part_t> tmpv;
5276 std::sort (out_part_indices.begin(), out_part_indices.end());
5277 mj_part_t outP = out_part_indices.size();
5279 mj_gno_t new_global_num_points = 0;
5280 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5282 if (this->mj_keep_part_boxes){
5283 input_part_boxes->clear();
5288 for (mj_part_t i = 0; i < outP; ++i){
5289 mj_part_t ind = out_part_indices[i];
5290 new_global_num_points += global_num_points_in_parts[ind];
5291 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5292 if (this->mj_keep_part_boxes){
5293 input_part_boxes->push_back((*output_part_boxes)[ind]);
5297 if (this->mj_keep_part_boxes){
5298 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5299 input_part_boxes = output_part_boxes;
5300 output_part_boxes = tmpPartBoxes;
5302 next_future_num_parts_in_parts->clear();
5303 for (mj_part_t i = 0; i < outP; ++i){
5304 mj_part_t p = tmpv[i];
5305 next_future_num_parts_in_parts->push_back(p);
5308 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5310 mj_lno_t num_new_local_points = 0;
5314 this->mj_migrate_coords(
5316 num_new_local_points,
5318 coordinate_destinations,
5322 freeArray<int>(coordinate_destinations);
5324 if(this->num_local_coords != num_new_local_points){
5325 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5326 freeArray<mj_lno_t>(this->coordinate_permutations);
5328 this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5329 this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5331 this->num_local_coords = num_new_local_points;
5332 this->num_global_coords = new_global_num_points;
5337 this->create_sub_communicator(processor_ranks_for_subcomm);
5338 processor_ranks_for_subcomm.clear();
5341 this->fill_permutation_array(
5361 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5363 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5364 mj_part_t num_parts,
5365 mj_scalar_t *mj_current_dim_coords,
5366 mj_scalar_t *current_concurrent_cut_coordinate,
5367 mj_lno_t coordinate_begin,
5368 mj_lno_t coordinate_end,
5369 mj_scalar_t *used_local_cut_line_weight_to_left,
5370 mj_lno_t *out_part_xadj,
5371 int coordInd,
bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted){
5374 mj_part_t no_cuts = num_parts - 1;
5379 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5380 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5385 if (this->distribute_points_on_cut_lines){
5387 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5388 for (mj_part_t i = 0; i < no_cuts; ++i){
5390 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5392 for(
int ii = 0; ii < this->num_threads; ++ii){
5393 if(left_weight > this->sEpsilon){
5395 mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5396 if(thread_ii_weight_on_cut < left_weight){
5397 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5400 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5402 left_weight -= thread_ii_weight_on_cut;
5405 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5413 for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5414 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5415 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5423 for(mj_part_t ii = 0; ii < num_parts; ++ii){
5424 thread_num_points_in_parts[ii] = 0;
5436 mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5439 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5440 typedef std::vector< multiSItem > multiSVector;
5441 typedef std::vector<multiSVector> multiS2Vector;
5444 std::vector<mj_scalar_t *>allocated_memory;
5447 multiS2Vector sort_vector_points_on_cut;
5450 mj_part_t different_cut_count = 1;
5455 multiSVector tmpMultiSVector;
5456 sort_vector_points_on_cut.push_back(tmpMultiSVector);
5458 for (mj_part_t i = 1; i < no_cuts ; ++i){
5461 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5462 cut_map[i] = cut_map[i-1];
5465 cut_map[i] = different_cut_count++;
5466 multiSVector tmp2MultiSVector;
5467 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5473 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5475 mj_lno_t i = this->coordinate_permutations[ii];
5477 mj_part_t pp = this->assigned_part_ids[i];
5478 mj_part_t p = pp / 2;
5481 mj_scalar_t *
vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5482 allocated_memory.push_back(
vals);
5487 if (longest_dim_part){
5489 for(
int dim = this->coord_dim - 2; dim >= 0; --dim){
5491 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
5493 vals[val_ind++] = this->mj_coordinates[next_largest_coord_dim][i];
5497 for(
int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5498 vals[val_ind++] = this->mj_coordinates[dim][i];
5500 for(
int dim = 0; dim < coordInd; ++dim){
5501 vals[val_ind++] = this->mj_coordinates[dim][i];
5504 multiSItem tempSortItem(i, this->coord_dim -1,
vals);
5506 mj_part_t cmap = cut_map[p];
5507 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5511 ++thread_num_points_in_parts[p];
5512 this->assigned_part_ids[i] = p;
5517 for (mj_part_t i = 0; i < different_cut_count; ++i){
5518 std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5522 mj_part_t previous_cut_map = cut_map[0];
5531 mj_scalar_t weight_stolen_from_previous_part = 0;
5532 for (mj_part_t p = 0; p < no_cuts; ++p){
5534 mj_part_t mapped_cut = cut_map[p];
5538 if (previous_cut_map != mapped_cut){
5539 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5540 for (; sort_vector_end >= 0; --sort_vector_end){
5541 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5542 mj_lno_t i = t.index;
5543 ++thread_num_points_in_parts[p];
5544 this->assigned_part_ids[i] = p;
5546 sort_vector_points_on_cut[previous_cut_map].clear();
5550 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5555 for (; sort_vector_end >= 0; --sort_vector_end){
5558 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5560 mj_lno_t i = t.index;
5561 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5565 if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5566 my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part -
ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5569 my_local_thread_cut_weights_to_put_left[p] -= w;
5570 sort_vector_points_on_cut[mapped_cut].pop_back();
5571 ++thread_num_points_in_parts[p];
5572 this->assigned_part_ids[i] = p;
5575 if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5576 if(mapped_cut == cut_map[p + 1] ){
5579 if (previous_cut_map != mapped_cut){
5580 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5585 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5589 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5598 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5599 if (previous_cut_map != mapped_cut){
5600 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5603 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5607 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5613 previous_cut_map = mapped_cut;
5618 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5623 for (; sort_vector_end >= 0; --sort_vector_end){
5626 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5628 mj_lno_t i = t.index;
5629 ++thread_num_points_in_parts[no_cuts];
5630 this->assigned_part_ids[i] = no_cuts;
5632 sort_vector_points_on_cut[previous_cut_map].clear();
5633 freeArray<mj_part_t> (cut_map);
5636 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5637 for(mj_lno_t i = 0; i < vSize; ++i){
5638 freeArray<mj_scalar_t> (allocated_memory[i]);
5642 for(mj_part_t j = 0; j < num_parts; ++j){
5643 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5644 for (
int i = 0; i < this->num_threads; ++i){
5645 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5646 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5647 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5650 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
5654 for(mj_part_t j = 1; j < num_parts; ++j){
5655 out_part_xadj[j] += out_part_xadj[j - 1];
5661 for(mj_part_t j = 1; j < num_parts; ++j){
5662 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5667 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5668 mj_lno_t i = this->coordinate_permutations[ii];
5669 mj_part_t p = this->assigned_part_ids[i];
5670 this->new_coordinate_permutations[coordinate_begin +
5671 thread_num_points_in_parts[p]++] = i;
5686 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5688 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5689 mj_part_t current_num_parts,
5690 mj_part_t output_part_begin_index,
5691 RCP<mj_partBoxVector_t> &output_part_boxes,
5692 bool is_data_ever_migrated)
5694 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5696 #ifdef HAVE_ZOLTAN2_OMP
5697 #pragma omp parallel for
5699 for(mj_part_t i = 0; i < current_num_parts;++i){
5702 mj_lno_t end = this->part_xadj[i];
5704 if(i > 0) begin = this->part_xadj[i -1];
5705 mj_part_t part_to_set_index = i + output_part_begin_index;
5706 if (this->mj_keep_part_boxes){
5707 (*output_part_boxes)[i].setpId(part_to_set_index);
5709 for (mj_lno_t ii = begin; ii < end; ++ii){
5710 mj_lno_t k = this->coordinate_permutations[ii];
5711 this->assigned_part_ids[k] = part_to_set_index;
5716 if(!is_data_ever_migrated){
5723 #ifdef ENABLE_ZOLTAN_MIGRATION
5724 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5731 ZOLTAN_COMM_OBJ *plan = NULL;
5732 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5735 int message_tag = 7856;
5737 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating");
5738 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
5739 this->owner_of_coordinate, mpi_comm, message_tag,
5742 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating" );
5744 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5747 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5748 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->current_mj_gnos,
5749 sizeof(mj_gno_t), (
char *) incoming_gnos);
5752 freeArray<mj_gno_t>(this->current_mj_gnos);
5753 this->current_mj_gnos = incoming_gnos;
5755 mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5758 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->assigned_part_ids,
5759 sizeof(mj_part_t), (
char *) incoming_partIds);
5761 freeArray<mj_part_t>(this->assigned_part_ids);
5762 this->assigned_part_ids = incoming_partIds;
5764 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5765 ierr = Zoltan_Comm_Destroy(&plan);
5768 this->num_local_coords = incoming;
5773 #endif // !ENABLE_ZOLTAN_MIGRATION
5776 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating");
5777 Tpetra::Distributor distributor(this->mj_problemComm);
5778 ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5779 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5780 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating" );
5782 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5784 ArrayRCP<mj_gno_t> received_gnos(incoming);
5785 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5786 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5787 freeArray<mj_gno_t>(this->current_mj_gnos);
5788 this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5789 memcpy( this->current_mj_gnos,
5790 received_gnos.getRawPtr(),
5791 incoming *
sizeof(mj_gno_t));
5794 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5795 ArrayRCP<mj_part_t> received_partids(incoming);
5796 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5797 freeArray<mj_part_t>(this->assigned_part_ids);
5798 this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5799 memcpy( this->assigned_part_ids,
5800 received_partids.getRawPtr(),
5801 incoming *
sizeof(mj_part_t));
5803 this->num_local_coords = incoming;
5804 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5809 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5811 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5816 if (this->mj_keep_part_boxes){
5817 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
5821 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5826 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5828 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
5829 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5831 for (
int i=0; i < this->coord_dim; i++){
5832 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5834 freeArray<mj_scalar_t *>(this->mj_coordinates);
5836 for (
int i=0; i < this->num_weights_per_coord; i++){
5837 freeArray<mj_scalar_t>(this->mj_weights[i]);
5839 freeArray<mj_scalar_t *>(this->mj_weights);
5841 freeArray<int>(this->owner_of_coordinate);
5843 for(
int i = 0; i < this->num_threads; ++i){
5844 freeArray<mj_lno_t>(this->thread_point_counts[i]);
5847 freeArray<mj_lno_t *>(this->thread_point_counts);
5848 freeArray<double *> (this->thread_part_weight_work);
5850 if(this->distribute_points_on_cut_lines){
5851 freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5852 for(
int i = 0; i < this->num_threads; ++i){
5853 freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5855 freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5856 freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5857 freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5860 freeArray<mj_part_t>(this->my_incomplete_cut_count);
5862 freeArray<mj_scalar_t>(this->max_min_coords);
5864 freeArray<mj_lno_t>(this->part_xadj);
5866 freeArray<mj_lno_t>(this->coordinate_permutations);
5868 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5870 freeArray<mj_scalar_t>(this->all_cut_coordinates);
5872 freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5874 freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5876 freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5878 freeArray<mj_scalar_t>(this->target_part_weights);
5880 freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5882 freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5884 freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5885 freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5886 freeArray<bool>(this->is_cut_line_determined);
5887 freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5888 freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5890 for(
int i = 0; i < this->num_threads; ++i){
5891 freeArray<double>(this->thread_part_weights[i]);
5892 freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5893 freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5896 freeArray<double *>(this->thread_part_weights);
5897 freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5898 freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5900 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5911 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5914 bool distribute_points_on_cut_lines_,
5915 int max_concurrent_part_calculation_,
5916 int check_migrate_avoid_migration_option_,
5917 mj_scalar_t minimum_migration_imbalance_,
5918 int migration_type_ ){
5919 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5920 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5921 this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5922 this->minimum_migration_imbalance = minimum_migration_imbalance_;
5923 this->migration_type = migration_type_;
5958 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5962 const RCP<const Environment> &env,
5963 RCP<
const Comm<int> > &problemComm,
5965 double imbalance_tolerance_,
5966 size_t num_global_parts_,
5967 mj_part_t *part_no_array_,
5968 int recursion_depth_,
5971 mj_lno_t num_local_coords_,
5972 mj_gno_t num_global_coords_,
5973 const mj_gno_t *initial_mj_gnos_,
5974 mj_scalar_t **mj_coordinates_,
5976 int num_weights_per_coord_,
5977 bool *mj_uniform_weights_,
5978 mj_scalar_t **mj_weights_,
5979 bool *mj_uniform_parts_,
5980 mj_scalar_t **mj_part_sizes_,
5982 mj_part_t *&result_assigned_part_ids_,
5983 mj_gno_t *&result_mj_gnos_
5990 if(comm->getRank() == 0){
5991 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
5992 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
5993 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
5997 this->mj_problemComm = problemComm;
5998 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
6020 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Total");
6021 this->mj_env->debug(3,
"In MultiJagged Jagged");
6024 this->imbalance_tolerance = imbalance_tolerance_;
6025 this->num_global_parts = num_global_parts_;
6026 this->part_no_array = part_no_array_;
6027 this->recursion_depth = recursion_depth_;
6029 this->coord_dim = coord_dim_;
6030 this->num_local_coords = num_local_coords_;
6031 this->num_global_coords = num_global_coords_;
6032 this->mj_coordinates = mj_coordinates_;
6033 this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_;
6035 this->num_weights_per_coord = num_weights_per_coord_;
6036 this->mj_uniform_weights = mj_uniform_weights_;
6037 this->mj_weights = mj_weights_;
6038 this->mj_uniform_parts = mj_uniform_parts_;
6039 this->mj_part_sizes = mj_part_sizes_;
6041 this->num_threads = 1;
6042 #ifdef HAVE_ZOLTAN2_OMP
6043 #pragma omp parallel
6046 this->num_threads = omp_get_num_threads();
6051 this->set_part_specifications();
6053 this->allocate_set_work_memory();
6057 this->comm = this->mj_problemComm->duplicate();
6060 mj_part_t current_num_parts = 1;
6061 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
6063 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6065 mj_part_t output_part_begin_index = 0;
6066 mj_part_t future_num_parts = this->total_num_part;
6067 bool is_data_ever_migrated =
false;
6069 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
6070 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
6071 next_future_num_parts_in_parts->push_back(this->num_global_parts);
6073 RCP<mj_partBoxVector_t> input_part_boxes(
new mj_partBoxVector_t(),
true) ;
6074 RCP<mj_partBoxVector_t> output_part_boxes(
new mj_partBoxVector_t(),
true);
6076 compute_global_box();
6077 if(this->mj_keep_part_boxes){
6078 this->init_part_boxes(output_part_boxes);
6081 for (
int i = 0; i < this->recursion_depth; ++i){
6084 std::vector <mj_part_t> num_partitioning_in_current_dim;
6098 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
6099 future_num_part_in_parts = next_future_num_parts_in_parts;
6100 next_future_num_parts_in_parts = tmpPartVect;
6105 next_future_num_parts_in_parts->clear();
6107 if(this->mj_keep_part_boxes){
6108 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6109 input_part_boxes = output_part_boxes;
6110 output_part_boxes = tmpPartBoxes;
6111 output_part_boxes->clear();
6115 mj_part_t output_part_count_in_dimension =
6116 this->update_part_num_arrays(
6117 num_partitioning_in_current_dim,
6118 future_num_part_in_parts,
6119 next_future_num_parts_in_parts,
6124 output_part_boxes, 1);
6129 if(output_part_count_in_dimension == current_num_parts) {
6131 tmpPartVect= future_num_part_in_parts;
6132 future_num_part_in_parts = next_future_num_parts_in_parts;
6133 next_future_num_parts_in_parts = tmpPartVect;
6135 if(this->mj_keep_part_boxes){
6136 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6137 input_part_boxes = output_part_boxes;
6138 output_part_boxes = tmpPartBoxes;
6145 int coordInd = i % this->coord_dim;
6146 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
6149 std::string istring = Teuchos::toString<int>(i);
6150 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6154 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
6157 mj_part_t output_part_index = 0;
6160 mj_part_t output_coordinate_end_index = 0;
6162 mj_part_t current_work_part = 0;
6163 mj_part_t current_concurrent_num_parts =
6164 std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
6166 mj_part_t obtained_part_index = 0;
6169 for (; current_work_part < current_num_parts;
6170 current_work_part += current_concurrent_num_parts){
6172 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
6173 this->max_concurrent_part_calculation);
6175 mj_part_t actual_work_part_count = 0;
6179 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6180 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
6184 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
6187 ++actual_work_part_count;
6188 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
6189 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
6197 this->mj_get_local_min_max_coord_totW(
6198 coordinate_begin_index,
6199 coordinate_end_index,
6200 this->coordinate_permutations,
6201 mj_current_dim_coords,
6202 this->process_local_min_max_coord_total_weight[kk],
6203 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
6204 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]);
6209 if (actual_work_part_count > 0){
6211 this->mj_get_global_min_max_coord_totW(
6212 current_concurrent_num_parts,
6213 this->process_local_min_max_coord_total_weight,
6214 this->global_min_max_coord_total_weight);
6218 mj_part_t total_incomplete_cut_count = 0;
6223 mj_part_t concurrent_part_cut_shift = 0;
6224 mj_part_t concurrent_part_part_shift = 0;
6225 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6226 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
6227 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6228 current_concurrent_num_parts];
6230 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6231 2 * current_concurrent_num_parts];
6233 mj_part_t concurrent_current_part_index = current_work_part + kk;
6235 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6237 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6238 mj_scalar_t *current_target_part_weights = this->target_part_weights +
6239 concurrent_part_part_shift;
6241 concurrent_part_cut_shift += partition_count - 1;
6243 concurrent_part_part_shift += partition_count;
6248 if(partition_count > 1 && min_coordinate <= max_coordinate){
6252 total_incomplete_cut_count += partition_count - 1;
6255 this->my_incomplete_cut_count[kk] = partition_count - 1;
6258 this->mj_get_initial_cut_coords_target_weights(
6261 partition_count - 1,
6262 global_total_weight,
6264 current_target_part_weights,
6265 future_num_part_in_parts,
6266 next_future_num_parts_in_parts,
6267 concurrent_current_part_index,
6268 obtained_part_index);
6270 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6271 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
6275 this->set_initial_coordinate_parts(
6278 concurrent_current_part_index,
6279 coordinate_begin_index, coordinate_end_index,
6280 this->coordinate_permutations,
6281 mj_current_dim_coords,
6282 this->assigned_part_ids,
6287 this->my_incomplete_cut_count[kk] = 0;
6289 obtained_part_index += partition_count;
6296 mj_scalar_t used_imbalance = 0;
6301 mj_current_dim_coords,
6304 current_concurrent_num_parts,
6305 current_cut_coordinates,
6306 total_incomplete_cut_count,
6307 num_partitioning_in_current_dim);
6312 mj_part_t output_array_shift = 0;
6313 mj_part_t cut_shift = 0;
6314 size_t tlr_shift = 0;
6315 size_t partweight_array_shift = 0;
6317 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6318 mj_part_t current_concurrent_work_part = current_work_part + kk;
6319 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6322 if((num_parts != 1 )
6324 this->global_min_max_coord_total_weight[kk] >
6325 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6329 for(mj_part_t jj = 0; jj < num_parts; ++jj){
6330 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6332 cut_shift += num_parts - 1;
6333 tlr_shift += (4 *(num_parts - 1) + 1);
6334 output_array_shift += num_parts;
6335 partweight_array_shift += (2 * (num_parts - 1) + 1);
6339 mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6340 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6341 current_concurrent_work_part -1];
6342 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6343 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6348 for(
int ii = 0; ii < this->num_threads; ++ii){
6349 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6353 if(this->mj_keep_part_boxes){
6355 for (mj_part_t j = 0; j < num_parts - 1; ++j){
6356 (*output_part_boxes)[output_array_shift + output_part_index +
6357 j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6360 (*output_part_boxes)[output_array_shift + output_part_index + j +
6361 1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6367 this->mj_create_new_partitions(
6369 mj_current_dim_coords,
6370 current_concurrent_cut_coordinate,
6373 used_local_cut_line_weight_to_left,
6374 this->thread_part_weight_work,
6375 this->new_part_xadj + output_part_index + output_array_shift
6382 mj_lno_t part_size = coordinate_end - coordinate_begin;
6383 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6385 this->new_coordinate_permutations + coordinate_begin,
6386 this->coordinate_permutations + coordinate_begin,
6387 part_size *
sizeof(mj_lno_t));
6389 cut_shift += num_parts - 1;
6390 tlr_shift += (4 *(num_parts - 1) + 1);
6391 output_array_shift += num_parts;
6392 partweight_array_shift += (2 * (num_parts - 1) + 1);
6402 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6403 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6404 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6406 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6409 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6411 output_part_index += num_parts ;
6418 int current_world_size = this->comm->getSize();
6419 long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6422 bool is_migrated_in_current_dimension =
false;
6427 if (future_num_parts > 1 &&
6428 this->check_migrate_avoid_migration_option >= 0 &&
6429 current_world_size > 1){
6431 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6432 mj_part_t num_parts = output_part_count_in_dimension;
6433 if ( this->mj_perform_migration(
6436 next_future_num_parts_in_parts,
6437 output_part_begin_index,
6438 migration_reduce_all_population,
6439 this->num_local_coords / (future_num_parts * current_num_parts),
6441 input_part_boxes, output_part_boxes) ) {
6442 is_migrated_in_current_dimension =
true;
6443 is_data_ever_migrated =
true;
6444 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" +
6447 this->total_dim_num_reduce_all /= num_parts;
6450 is_migrated_in_current_dimension =
false;
6451 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6456 mj_lno_t * tmp = this->coordinate_permutations;
6457 this->coordinate_permutations = this->new_coordinate_permutations;
6458 this->new_coordinate_permutations = tmp;
6460 if(!is_migrated_in_current_dimension){
6461 this->total_dim_num_reduce_all -= current_num_parts;
6462 current_num_parts = output_part_count_in_dimension;
6464 freeArray<mj_lno_t>(this->part_xadj);
6465 this->part_xadj = this->new_part_xadj;
6466 this->new_part_xadj = NULL;
6467 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6471 delete future_num_part_in_parts;
6472 delete next_future_num_parts_in_parts;
6474 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6481 this->set_final_parts(
6483 output_part_begin_index,
6485 is_data_ever_migrated);
6487 result_assigned_part_ids_ = this->assigned_part_ids;
6488 result_mj_gnos_ = this->current_mj_gnos;
6490 this->free_work_memory();
6491 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Total");
6492 this->mj_env->debug(3,
"Out of MultiJagged");
6500 template <
typename Adapter>
6505 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6508 typedef typename Adapter::scalar_t mj_scalar_t;
6509 typedef typename Adapter::gno_t mj_gno_t;
6510 typedef typename Adapter::lno_t mj_lno_t;
6511 typedef typename Adapter::node_t mj_node_t;
6514 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6518 RCP<const Environment> mj_env;
6519 RCP<const Comm<int> > mj_problemComm;
6520 RCP<const coordinateModel_t> mj_coords;
6523 double imbalance_tolerance;
6524 size_t num_global_parts;
6525 mj_part_t *part_no_array;
6526 int recursion_depth;
6529 mj_lno_t num_local_coords;
6530 mj_gno_t num_global_coords;
6531 const mj_gno_t *initial_mj_gnos;
6532 mj_scalar_t **mj_coordinates;
6534 int num_weights_per_coord;
6535 bool *mj_uniform_weights;
6536 mj_scalar_t **mj_weights;
6537 bool *mj_uniform_parts;
6538 mj_scalar_t **mj_part_sizes;
6540 bool distribute_points_on_cut_lines;
6541 mj_part_t max_concurrent_part_calculation;
6542 int check_migrate_avoid_migration_option;
6545 mj_scalar_t minimum_migration_imbalance;
6546 bool mj_keep_part_boxes;
6551 int mj_premigration_option;
6552 int min_coord_per_rank_for_premigration;
6554 ArrayRCP<mj_part_t> comXAdj_;
6555 ArrayRCP<mj_part_t> comAdj_;
6561 ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6563 void set_up_partitioning_data(
6566 void set_input_parameters(
const Teuchos::ParameterList &p);
6568 void free_work_memory();
6570 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
6572 bool mj_premigrate_to_subset(
int used_num_ranks,
int migration_selection_option,
6573 RCP<const Environment> mj_env_,
6574 RCP<
const Comm<int> > mj_problemComm_,
6576 mj_lno_t num_local_coords_,
6577 mj_gno_t num_global_coords_,
size_t num_global_parts_,
6578 const mj_gno_t *initial_mj_gnos_,
6579 mj_scalar_t **mj_coordinates_,
6580 int num_weights_per_coord_,
6581 mj_scalar_t **mj_weights_,
6583 RCP<
const Comm<int> > &result_problemComm_,
6584 mj_lno_t & result_num_local_coords_,
6585 mj_gno_t * &result_initial_mj_gnos_,
6586 mj_scalar_t ** &result_mj_coordinates_,
6587 mj_scalar_t ** &result_mj_weights_,
6588 int * &result_actual_owner_rank_);
6593 RCP<
const Comm<int> > &problemComm,
6594 const RCP<const coordinateModel_t> &coords) :
6595 mj_partitioner(), mj_env(env),
6596 mj_problemComm(problemComm),
6598 imbalance_tolerance(0),
6599 num_global_parts(1), part_no_array(NULL),
6601 coord_dim(0),num_local_coords(0), num_global_coords(0),
6602 initial_mj_gnos(NULL), mj_coordinates(NULL),
6603 num_weights_per_coord(0),
6604 mj_uniform_weights(NULL), mj_weights(NULL),
6605 mj_uniform_parts(NULL),
6606 mj_part_sizes(NULL),
6607 distribute_points_on_cut_lines(true),
6608 max_concurrent_part_calculation(1),
6609 check_migrate_avoid_migration_option(0), migration_type(0),
6610 minimum_migration_imbalance(0.30),
6611 mj_keep_part_boxes(false), num_threads(1), mj_run_as_rcb(false),mj_premigration_option(0), min_coord_per_rank_for_premigration(32000),
6612 comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6615 if (coordinate_ArrayRCP_holder != NULL){
6616 delete [] this->coordinate_ArrayRCP_holder;
6617 this->coordinate_ArrayRCP_holder = NULL;
6625 const bool bUnsorted =
true;
6626 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6628 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
6629 "algorithm. As many as the dimension count.", mj_parts_Validator);
6631 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
6634 pl.set(
"mj_minimum_migration_imbalance", 1.1,
6635 "mj_minimum_migration_imbalance, the minimum imbalance of the "
6636 "processors to avoid migration",
6639 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6640 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6641 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
6642 "depending on the imbalance, 1 for forcing migration, 2 for "
6643 "avoiding migration", mj_migration_option_validator);
6648 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
6649 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
6650 pl.set(
"mj_migration_type", 0,
"Migration type, 0 for migration to minimize the imbalance "
6651 "1 for migration to minimize messages exchanged the migration." ,
6652 mj_migration_option_validator);
6655 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
6659 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
6662 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
6665 RCP<Teuchos::EnhancedNumberValidator<int>> mj_premigration_option_validator =
6666 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
6668 pl.set(
"mj_premigration_option", 0,
"Whether to do premigration or not. 0 for no migration "
6669 "x > 0 for migration to consecutive processors, the subset will be 0,x,2x,3x,...subset ranks."
6670 , mj_premigration_option_validator);
6672 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to assign each rank in multijagged after premigration"
6687 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6691 mj_part_t
pointAssign(
int dim, mj_scalar_t *point)
const;
6693 void boxAssign(
int dim, mj_scalar_t *lower, mj_scalar_t *upper,
6694 size_t &nPartsFound, mj_part_t **partsFound)
const;
6701 ArrayRCP<mj_part_t> &comXAdj,
6702 ArrayRCP<mj_part_t> &comAdj);
6708 template <
typename Adapter>
6709 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
int used_num_ranks,
6710 int migration_selection_option,
6711 RCP<const Environment> mj_env_,
6712 RCP<
const Comm<int> > mj_problemComm_,
6714 mj_lno_t num_local_coords_,
6715 mj_gno_t num_global_coords_,
size_t num_global_parts_,
6716 const mj_gno_t *initial_mj_gnos_,
6717 mj_scalar_t **mj_coordinates_,
6718 int num_weights_per_coord_,
6719 mj_scalar_t **mj_weights_,
6721 RCP<
const Comm<int> > &result_problemComm_,
6722 mj_lno_t &result_num_local_coords_,
6723 mj_gno_t * &result_initial_mj_gnos_,
6724 mj_scalar_t ** &result_mj_coordinates_,
6725 mj_scalar_t ** &result_mj_weights_,
6726 int * &result_actual_owner_rank_){
6727 mj_env_->timerStart(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorPlanCreating");
6730 int myRank = mj_problemComm_->getRank();
6731 int worldSize = mj_problemComm_->getSize();
6733 mj_part_t groupsize = worldSize / used_num_ranks;
6737 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
6739 mj_part_t i_am_sending_to = 0;
6740 bool am_i_a_reciever =
false;
6742 for(
int i = 0; i < used_num_ranks; ++i){
6743 group_begins[i+ 1] = group_begins[i] + groupsize;
6744 if (worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
6745 if (i == used_num_ranks) group_begins[i+ 1] = worldSize;
6746 if (myRank >= group_begins[i] && myRank < group_begins[i + 1]) i_am_sending_to = group_begins[i];
6747 if (myRank == group_begins[i]) am_i_a_reciever=
true;
6750 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
6751 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
6754 Tpetra::Distributor distributor(mj_problemComm_);
6756 std::vector<mj_part_t> coordinate_destinations(num_local_coords_, i_am_sending_to);
6757 ArrayView<const mj_part_t> destinations( &(coordinate_destinations[0]), num_local_coords_);
6758 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6759 result_num_local_coords_ = num_incoming_gnos;
6760 mj_env_->timerStop(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorPlanCreating");
6762 mj_env_->timerStart(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorMigration");
6766 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6768 ArrayView<const mj_gno_t> sent_gnos(initial_mj_gnos_, num_local_coords_);
6769 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6771 result_initial_mj_gnos_ = allocMemory<mj_gno_t>(num_incoming_gnos);
6773 result_initial_mj_gnos_,
6774 received_gnos.getRawPtr(),
6775 num_incoming_gnos *
sizeof(mj_gno_t));
6779 result_mj_coordinates_ = allocMemory<mj_scalar_t *>(coord_dim_);
6780 for (
int i = 0; i < coord_dim_; ++i){
6781 ArrayView<const mj_scalar_t> sent_coord(mj_coordinates_[i], num_local_coords_);
6782 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
6783 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
6784 result_mj_coordinates_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
6786 result_mj_coordinates_[i],
6787 received_coord.getRawPtr(),
6788 num_incoming_gnos *
sizeof(mj_scalar_t));
6791 result_mj_weights_ = allocMemory<mj_scalar_t *>(num_weights_per_coord_);
6793 for (
int i = 0; i < num_weights_per_coord_; ++i){
6794 ArrayView<const mj_scalar_t> sent_weight(mj_weights_[i], num_local_coords_);
6795 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6796 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
6797 result_mj_weights_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
6799 result_mj_weights_[i],
6800 received_weight.getRawPtr(),
6801 num_incoming_gnos *
sizeof(mj_scalar_t));
6806 std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
6807 ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
6808 ArrayRCP<int> received_owners(num_incoming_gnos);
6809 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
6810 result_actual_owner_rank_ = allocMemory<int>(num_incoming_gnos);
6812 result_actual_owner_rank_,
6813 received_owners.getRawPtr(),
6814 num_incoming_gnos *
sizeof(int));
6816 mj_env_->timerStop(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorMigration");
6817 return am_i_a_reciever;
6835 template <
typename Adapter>
6840 this->set_up_partitioning_data(solution);
6841 this->set_input_parameters(this->mj_env->getParameters());
6842 if (this->mj_keep_part_boxes){
6843 this->mj_partitioner.set_to_keep_part_boxes();
6845 this->mj_partitioner.set_partitioning_parameters(
6846 this->distribute_points_on_cut_lines,
6847 this->max_concurrent_part_calculation,
6848 this->check_migrate_avoid_migration_option,
6849 this->minimum_migration_imbalance, this->migration_type);
6852 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
6853 mj_lno_t result_num_local_coords = this->num_local_coords;
6854 mj_gno_t * result_initial_mj_gnos = NULL;
6855 mj_scalar_t **result_mj_coordinates = this->mj_coordinates;
6856 mj_scalar_t **result_mj_weights = this->mj_weights;
6857 int *result_actual_owner_rank = NULL;
6858 const mj_gno_t * result_initial_mj_gnos_ = this->initial_mj_gnos;
6875 int current_world_size = this->mj_problemComm->getSize();
6876 mj_lno_t threshold_num_local_coords = this->min_coord_per_rank_for_premigration;
6877 bool is_pre_migrated =
false;
6878 bool am_i_in_subset =
true;
6879 if ( mj_premigration_option > 0 &&
6880 size_t (current_world_size) > this->num_global_parts &&
6881 this->num_global_coords < mj_gno_t (current_world_size * threshold_num_local_coords)){
6882 if (this->mj_keep_part_boxes){
6883 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and mj_premigration_option are not supported together yet.");
6885 is_pre_migrated =
true;
6886 int migration_selection_option = mj_premigration_option;
6887 if(migration_selection_option * this->num_global_parts > (
size_t) (current_world_size)){
6888 migration_selection_option = current_world_size / this->num_global_parts;
6890 int used_num_ranks = int (this->num_global_coords /
float (threshold_num_local_coords) + 0.5);
6891 if (used_num_ranks == 0) used_num_ranks = 1;
6893 am_i_in_subset = this->mj_premigrate_to_subset(
6895 migration_selection_option,
6897 this->mj_problemComm,
6899 this->num_local_coords,
6900 this->num_global_coords,
6901 this->num_global_parts,
6902 this->initial_mj_gnos,
6903 this->mj_coordinates,
6904 this->num_weights_per_coord,
6908 result_num_local_coords,
6909 result_initial_mj_gnos,
6910 result_mj_coordinates,
6912 result_actual_owner_rank);
6913 result_initial_mj_gnos_ = result_initial_mj_gnos;
6918 mj_part_t *result_assigned_part_ids = NULL;
6919 mj_gno_t *result_mj_gnos = NULL;
6921 if (am_i_in_subset){
6922 this->mj_partitioner.multi_jagged_part(
6926 this->imbalance_tolerance,
6927 this->num_global_parts,
6928 this->part_no_array,
6929 this->recursion_depth,
6932 result_num_local_coords,
6933 this->num_global_coords,
6934 result_initial_mj_gnos_,
6935 result_mj_coordinates,
6937 this->num_weights_per_coord,
6938 this->mj_uniform_weights,
6940 this->mj_uniform_parts,
6941 this->mj_part_sizes,
6943 result_assigned_part_ids,
6951 #if defined(__cplusplus) && __cplusplus >= 201103L
6952 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6953 localGidToLid.reserve(result_num_local_coords);
6954 for (mj_lno_t i = 0; i < result_num_local_coords; i++)
6955 localGidToLid[result_initial_mj_gnos_[i]] = i;
6956 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
6957 0, result_num_local_coords,
true);
6959 for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
6960 mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6961 partId[origLID] = result_assigned_part_ids[i];
6965 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6966 localGidToLid(result_num_local_coords);
6967 for (mj_lno_t i = 0; i < result_num_local_coords; i++)
6968 localGidToLid.put(result_initial_mj_gnos_[i], i);
6970 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
6971 0, result_num_local_coords,
true);
6973 for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
6974 mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6975 partId[origLID] = result_assigned_part_ids[i];
6978 #endif // C++11 is enabled
6980 delete [] result_mj_gnos;
6981 delete [] result_assigned_part_ids;
6986 if (is_pre_migrated){
6987 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorPlanCreating");
6988 Tpetra::Distributor distributor(this->mj_problemComm);
6990 ArrayView<const mj_part_t> actual_owner_destinations( result_actual_owner_rank , result_num_local_coords);
6991 mj_lno_t num_incoming_gnos = distributor.createFromSends(actual_owner_destinations);
6992 if (num_incoming_gnos != this->num_local_coords){
6993 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - num incoming is not equal to num local coords");
6995 mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorPlanCreating");
6996 mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorMigration");
6997 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6998 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
7000 ArrayView<const mj_gno_t> sent_gnos(result_initial_mj_gnos_, result_num_local_coords);
7001 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7004 ArrayView<mj_part_t> sent_partnos(partId());
7005 distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1, received_partids());
7007 partId = arcp(
new mj_part_t[this->num_local_coords],
7008 0, this->num_local_coords,
true);
7011 #if defined(__cplusplus) && __cplusplus >= 201103L
7012 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
7013 localGidToLid2.reserve(this->num_local_coords);
7014 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7015 localGidToLid2[this->initial_mj_gnos[i]] = i;
7018 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7019 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
7020 partId[origLID] = received_partids[i];
7024 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7025 localGidToLid2(this->num_local_coords);
7026 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7027 localGidToLid2.put(this->initial_mj_gnos[i], i);
7030 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7031 mj_lno_t origLID = localGidToLid2.get(received_gnos[i]);
7032 partId[origLID] = received_partids[i];
7035 #endif // C++11 is enabled
7040 freeArray<mj_gno_t> (result_initial_mj_gnos);
7041 for (
int i = 0; i < this->coord_dim; ++i){
7042 freeArray<mj_scalar_t> (result_mj_coordinates[i]);
7044 freeArray<mj_scalar_t *> (result_mj_coordinates);
7046 for (
int i = 0; i < this->num_weights_per_coord; ++i){
7047 freeArray<mj_scalar_t> (result_mj_weights[i]);
7049 freeArray<mj_scalar_t *> (result_mj_weights);
7050 freeArray<int> (result_actual_owner_rank);
7052 mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorMigration");
7056 solution->setParts(partId);
7057 this->free_work_memory();
7062 template <
typename Adapter>
7064 freeArray<mj_scalar_t *>(this->mj_coordinates);
7065 freeArray<mj_scalar_t *>(this->mj_weights);
7066 freeArray<bool>(this->mj_uniform_parts);
7067 freeArray<mj_scalar_t *>(this->mj_part_sizes);
7068 freeArray<bool>(this->mj_uniform_weights);
7074 template <
typename Adapter>
7075 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
7076 const RCP<PartitioningSolution<Adapter> > &solution
7079 this->coord_dim = this->mj_coords->getCoordinateDim();
7080 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
7081 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
7082 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
7083 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
7088 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
7091 this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
7092 this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
7095 this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
7098 this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
7100 this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
7102 typedef StridedData<mj_lno_t, mj_scalar_t> input_t;
7103 ArrayView<const mj_gno_t> gnos;
7104 ArrayView<input_t> xyz;
7105 ArrayView<input_t> wgts;
7108 this->coordinate_ArrayRCP_holder =
new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
7110 this->mj_coords->getCoordinates(gnos, xyz, wgts);
7112 ArrayView<const mj_gno_t> mj_gnos = gnos;
7113 this->initial_mj_gnos = mj_gnos.getRawPtr();
7116 for (
int dim=0; dim < this->coord_dim; dim++){
7117 ArrayRCP<const mj_scalar_t> ar;
7118 xyz[dim].getInputArray(ar);
7119 this->coordinate_ArrayRCP_holder[dim] = ar;
7122 this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
7126 if (this->num_weights_per_coord == 0){
7127 this->mj_uniform_weights[0] =
true;
7128 this->mj_weights[0] = NULL;
7132 for (
int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
7133 ArrayRCP<const mj_scalar_t> ar;
7134 wgts[wdim].getInputArray(ar);
7135 this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
7136 this->mj_uniform_weights[wdim] =
false;
7137 this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
7141 for (
int wdim = 0; wdim < criteria_dim; wdim++){
7142 if (solution->criteriaHasUniformPartSizes(wdim)){
7143 this->mj_uniform_parts[wdim] =
true;
7144 this->mj_part_sizes[wdim] = NULL;
7147 std::cerr <<
"MJ does not support non uniform target part weights" << std::endl;
7156 template <
typename Adapter>
7157 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(
const Teuchos::ParameterList &pl){
7159 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
7162 tol = pe->getValue(&tol);
7163 this->imbalance_tolerance = tol - 1.0;
7167 if (this->imbalance_tolerance <= 0)
7168 this->imbalance_tolerance= 10e-4;
7171 this->part_no_array = NULL;
7173 this->recursion_depth = 0;
7175 if (pl.getPtr<Array <mj_part_t> >(
"mj_parts")){
7176 this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >(
"mj_parts")->getRawPtr();
7177 this->recursion_depth = pl.getPtr<Array <mj_part_t> >(
"mj_parts")->size() - 1;
7178 this->mj_env->debug(2,
"mj_parts provided by user");
7182 this->distribute_points_on_cut_lines =
true;
7183 this->max_concurrent_part_calculation = 1;
7185 this->mj_run_as_rcb =
false;
7186 this->mj_premigration_option = 0;
7187 this->min_coord_per_rank_for_premigration = 32000;
7189 int mj_user_recursion_depth = -1;
7190 this->mj_keep_part_boxes =
false;
7191 this->check_migrate_avoid_migration_option = 0;
7192 this->migration_type = 0;
7193 this->minimum_migration_imbalance = 0.35;
7195 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
7198 imb = pe->getValue(&imb);
7199 this->minimum_migration_imbalance = imb - 1.0;
7202 pe = pl.getEntryPtr(
"mj_migration_option");
7204 this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
7206 this->check_migrate_avoid_migration_option = 0;
7208 if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
7211 pe = pl.getEntryPtr(
"mj_migration_type");
7213 this->migration_type = pe->getValue(&this->migration_type);
7215 this->migration_type = 0;
7220 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
7222 this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
7224 this->max_concurrent_part_calculation = 1;
7227 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
7229 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
7231 this->mj_keep_part_boxes =
false;
7243 if (this->mj_keep_part_boxes ==
false){
7244 pe = pl.getEntryPtr(
"mapping_type");
7246 int mapping_type = -1;
7247 mapping_type = pe->getValue(&mapping_type);
7248 if (mapping_type == 0){
7249 mj_keep_part_boxes =
true;
7255 pe = pl.getEntryPtr(
"mj_enable_rcb");
7257 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
7259 this->mj_run_as_rcb =
false;
7262 pe = pl.getEntryPtr(
"mj_premigration_option");
7264 mj_premigration_option = pe->getValue(&mj_premigration_option);
7266 mj_premigration_option = 0;
7269 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
7271 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
7273 min_coord_per_rank_for_premigration = 32000;
7276 pe = pl.getEntryPtr(
"mj_recursion_depth");
7278 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
7280 mj_user_recursion_depth = -1;
7284 pe = pl.getEntryPtr(
"rectilinear");
7285 if (pe) val = pe->getValue(&val);
7287 this->distribute_points_on_cut_lines =
false;
7289 this->distribute_points_on_cut_lines =
true;
7292 if (this->mj_run_as_rcb){
7293 mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
7295 if (this->recursion_depth < 1){
7296 if (mj_user_recursion_depth > 0){
7297 this->recursion_depth = mj_user_recursion_depth;
7300 this->recursion_depth = this->coord_dim;
7304 this->num_threads = 1;
7305 #ifdef HAVE_ZOLTAN2_OMP
7306 #pragma omp parallel
7308 this->num_threads = omp_get_num_threads();
7315 template <
typename Adapter>
7318 typename Adapter::scalar_t *lower,
7319 typename Adapter::scalar_t *upper,
7320 size_t &nPartsFound,
7330 if (this->mj_keep_part_boxes) {
7333 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7335 size_t nBoxes = (*partBoxes).size();
7337 throw std::logic_error(
"no part boxes exist");
7341 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7343 if (globalBox->boxesOverlap(dim, lower, upper)) {
7345 std::vector<typename Adapter::part_t> partlist;
7348 for (
size_t i = 0; i < nBoxes; i++) {
7350 if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
7352 partlist.push_back((*partBoxes)[i].getpId());
7373 *partsFound =
new mj_part_t[nPartsFound];
7374 for (
size_t i = 0; i < nPartsFound; i++)
7375 (*partsFound)[i] = partlist[i];
7388 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
7393 template <
typename Adapter>
7396 typename Adapter::scalar_t *point)
const
7403 if (this->mj_keep_part_boxes) {
7407 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7409 size_t nBoxes = (*partBoxes).size();
7411 throw std::logic_error(
"no part boxes exist");
7415 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7417 if (globalBox->pointInBox(dim, point)) {
7421 for (i = 0; i < nBoxes; i++) {
7423 if ((*partBoxes)[i].pointInBox(dim, point)) {
7424 foundPart = (*partBoxes)[i].getpId();
7438 std::ostringstream oss;
7440 for (
int j = 0; j < dim; j++) oss << point[j] <<
" ";
7441 oss <<
") not found in domain";
7442 throw std::logic_error(oss.str());
7451 size_t closestBox = 0;
7452 mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
7453 mj_scalar_t *centroid =
new mj_scalar_t[dim];
7454 for (
size_t i = 0; i < nBoxes; i++) {
7455 (*partBoxes)[i].computeCentroid(centroid);
7456 mj_scalar_t sum = 0.;
7458 for (
int j = 0; j < dim; j++) {
7459 diff = centroid[j] - point[j];
7462 if (sum < minDistance) {
7467 foundPart = (*partBoxes)[closestBox].getpId();
7474 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
7478 template <
typename Adapter>
7484 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
7485 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
7486 mj_part_t ntasks = (*pBoxes).size();
7487 int dim = (*pBoxes)[0].getDim();
7496 template <
typename Adapter>
7497 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
7500 return this->mj_partitioner.get_kept_boxes();
7504 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7506 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7509 if (this->mj_keep_part_boxes)
7510 return this->kept_boxes;
7512 throw std::logic_error(
"Error: part boxes are not stored.");
7515 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7517 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7519 RCP<mj_partBoxVector_t> &localPartBoxes
7522 mj_part_t ntasks = this->num_global_parts;
7523 int dim = (*localPartBoxes)[0].getDim();
7524 mj_scalar_t *localPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
7526 memset(localPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
7528 mj_scalar_t *globalPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
7529 memset(globalPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
7531 mj_scalar_t *localPartMins = localPartBoundaries;
7532 mj_scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
7534 mj_scalar_t *globalPartMins = globalPartBoundaries;
7535 mj_scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
7537 mj_part_t boxCount = localPartBoxes->size();
7538 for (mj_part_t i = 0; i < boxCount; ++i){
7539 mj_part_t pId = (*localPartBoxes)[i].getpId();
7542 mj_scalar_t *lmins = (*localPartBoxes)[i].getlmins();
7543 mj_scalar_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
7545 for (
int j = 0; j < dim; ++j){
7546 localPartMins[dim * pId + j] = lmins[j];
7547 localPartMaxs[dim * pId + j] = lmaxs[j];
7559 reduceAll<int, mj_scalar_t>(*mj_problemComm, reductionOp,
7560 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7561 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
7562 for (mj_part_t i = 0; i < ntasks; ++i){
7564 globalPartMins + dim * i,
7565 globalPartMaxs + dim * i);
7577 delete []localPartBoundaries;
7578 delete []globalPartBoundaries;