26 #define z_swap(_a_, _b_, _t_) do { (_t_) = (_a_); (_a_) = (_b_); (_b_) = (_t_); } while (0)
33 static inline void sort_node_indices_sort_bubble(INT n, INT *keys)
37 for (i = 0; i < n; ++i)
40 while (j > 0 && keys[2 * j + 0] < keys[2 * (j - 1) + 0])
42 z_swap(keys[2 * j + 0], keys[2 * (j - 1) + 0], ti);
43 z_swap(keys[2 * j + 1], keys[2 * (j - 1) + 1], ti);
54 static inline void sort_node_indices_radix_count(INT n, INT *keys, INT shift, INT mask, INT *counts)
58 for (i = 0; i < n; ++i)
60 k = (keys[2 * i + 0] >> shift) & mask;
70 static inline void sort_node_indices_radix_rearrange(INT n, INT *keys_in, INT *keys_out, INT shift, INT mask, INT *displs)
74 for (i = 0; i < n; ++i)
76 k = (keys_in[2 * i + 0] >> shift) & mask;
77 keys_out[2 * displs[k] + 0] = keys_in[2 * i + 0];
78 keys_out[2 * displs[k] + 1] = keys_in[2 * i + 1];
84 #define radix_n (1 << rwidth)
91 void Y(sort_node_indices_radix_lsdf)(INT n, INT *keys0, INT *keys1, INT rhigh)
93 const INT radix_mask = radix_n - 1;
94 const INT rhigh_in = rhigh;
98 omp_get_max_threads();
103 INT *from, *to, *tmp;
108 INT tid = 0, tnum = 1;
110 STACK_MALLOC(INT*, lcounts, (
size_t)(tmax * radix_n) *
sizeof(INT));
118 #pragma omp parallel private(tid, tnum, i, l, h)
120 tid = omp_get_thread_num();
121 tnum = omp_get_num_threads();
124 for (i = 0; i < radix_n; ++i) lcounts[tid * radix_n + i] = 0;
126 l = (tid * n) / tnum;
127 h = ((tid + 1) * n) / tnum;
129 sort_node_indices_radix_count(h - l, from + (2 * l), rhigh_in - rhigh, radix_mask, &lcounts[tid * radix_n]);
135 for (i = 0; i < radix_n; ++i)
137 for (l = 0; l < tmax; ++l) lcounts[l * radix_n + i] = (k += lcounts[l * radix_n + i]) - lcounts[l * radix_n + i];
141 #pragma omp parallel private(tid, tnum, i, l, h)
143 tid = omp_get_thread_num();
144 tnum = omp_get_num_threads();
147 l = (tid * n) / tnum;
148 h = ((tid + 1) * n) / tnum;
150 sort_node_indices_radix_rearrange(h - l, from + (2 * l), to, rhigh_in - rhigh, radix_mask, &lcounts[tid * radix_n]);
164 if (to == keys0) memcpy(to, from, (
size_t)(n) * 2 *
sizeof(INT));
174 void Y(sort_node_indices_radix_msdf)(INT n, INT *keys0, INT *keys1, INT rhigh)
176 const INT radix_mask = radix_n - 1;
180 omp_get_max_threads();
188 INT counts[radix_n], displs[radix_n];
190 INT tid = 0, tnum = 1;
192 STACK_MALLOC(INT*, lcounts, (
size_t)(tmax * radix_n) *
sizeof(INT));
197 #pragma omp parallel private(tid, tnum, i, l, h)
199 tid = omp_get_thread_num();
200 tnum = omp_get_num_threads();
203 for (i = 0; i < radix_n; ++i) lcounts[tid * radix_n + i] = 0;
205 l = (tid * n) / tnum;
206 h = ((tid + 1) * n) / tnum;
208 sort_node_indices_radix_count(h - l, keys0 + (2 * l), rhigh + 1, radix_mask, &lcounts[tid * radix_n]);
214 for (i = 0; i < radix_n; ++i)
216 for (l = 0; l < tmax; ++l) lcounts[l * radix_n + i] = (k += lcounts[l * radix_n + i]) - lcounts[l * radix_n + i];
218 displs[i] = lcounts[0 * radix_n + i];
219 if (i > 0) counts[i - 1] = displs[i] - displs[i - 1];
221 counts[radix_n - 1] = n - displs[radix_n - 1];
224 #pragma omp parallel private(tid, tnum, i, l, h)
226 tid = omp_get_thread_num();
227 tnum = omp_get_num_threads();
230 l = (tid * n) / tnum;
231 h = ((tid + 1) * n) / tnum;
233 sort_node_indices_radix_rearrange(h - l, keys0 + (2 * l), keys1, rhigh + 1, radix_mask, &lcounts[tid * radix_n]);
238 memcpy(keys0, keys1, (
size_t)(n) * 2 *
sizeof(INT));
242 for (i = 0; i < radix_n; ++i)
247 Y(sort_node_indices_radix_msdf)(counts[i], keys0 + 2 * displs[i], keys1 + 2 * displs[i], rhigh);
249 sort_node_indices_sort_bubble(counts[i], keys0 + 2 * displs[i]);