48 return a >= b ? a : b;
57 return (R)(n) *
fak(n - 1);
72 for (k = 0; k <= m - r; k++)
74 sum +=
binom(m + k, k) * POW((xx + K(1.0)) / K(2.0), (R) k);
76 return sum * POW((xx + K(1.0)), (R) r) * POW(K(1.0) - xx, (R) (m + 1))
77 / (R)(1 << (m + 1)) /
fak(r);
81 C
regkern(kernel k, R xx,
int p,
const R *param, R a, R b)
90 if ((xx >= -K(0.5) + b && xx <= -a) || (xx >= a && xx <= K(0.5) - b))
92 return k(xx, 0, param);
94 else if (xx < -K(0.5) + b)
96 sum = (k(-K(0.5), 0, param) + k(K(0.5), 0, param)) / K(2.0)
97 *
BasisPoly(p - 1, 0, K(2.0) * xx / b + (K(1.0) - b) / b);
98 for (r = 0; r < p; r++)
100 sum += POW(-b / K(2.0), (R) r) * k(-K(0.5) + b, r, param)
101 *
BasisPoly(p - 1, r, -K(2.0) * xx / b + (b - K(1.0)) / b);
105 else if ((xx > -a) && (xx < a))
107 for (r = 0; r < p; r++)
111 * (k(-a, r, param) *
BasisPoly(p - 1, r, xx / a)
112 + k(a, r, param) *
BasisPoly(p - 1, r, -xx / a)
117 else if (xx > K(0.5) - b)
119 sum = (k(-K(0.5), 0, param) + k(K(0.5), 0, param)) / K(2.0)
120 *
BasisPoly(p - 1, 0, -K(2.0) * xx / b + (K(1.0) - b) / b);
121 for (r = 0; r < p; r++)
123 sum += POW(b / K(2.0), (R) r) * k(K(0.5) - b, r, param)
124 *
BasisPoly(p - 1, r, K(2.0) * xx / b - (K(1.0) - b) / b);
128 return k(xx, 0, param);
134 static C
regkern1(kernel k, R xx,
int p,
const R *param, R a, R b)
143 if ((xx >= -K(0.5) + b && xx <= -a) || (xx >= a && xx <= K(0.5) - b))
145 return k(xx, 0, param);
147 else if ((xx > -a) && (xx < a))
149 for (r = 0; r < p; r++)
153 * (k(-a, r, param) *
BasisPoly(p - 1, r, xx / a)
154 + k(a, r, param) *
BasisPoly(p - 1, r, -xx / a)
159 else if (xx < -K(0.5) + b)
161 for (r = 0; r < p; r++)
164 * (k(K(0.5) - b, r, param) *
BasisPoly(p - 1, r, (xx + K(0.5)) / b)
165 + k(-K(0.5) + b, r, param) *
BasisPoly(p - 1, r, -(xx + K(0.5)) / b)
170 else if (xx > K(0.5) - b)
172 for (r = 0; r < p; r++)
175 * (k(K(0.5) - b, r, param) *
BasisPoly(p - 1, r, (xx - K(0.5)) / b)
176 + k(-K(0.5) + b, r, param) *
BasisPoly(p - 1, r, -(xx - K(0.5)) / b)
181 return k(xx, 0, param);
230 static C
regkern3(kernel k, R xx,
int p,
const R *param, R a, R b)
243 if ((a <= xx) && (xx <= K(0.5) - b))
245 return k(xx, 0, param);
249 for (r = 0; r < p; r++)
251 sum += POW(-a, (R) r) * k(a, r, param)
257 else if ((K(0.5) - b < xx) && (xx <= K(0.5)))
259 sum = k(K(0.5), 0, param) *
BasisPoly(p - 1, 0, -K(2.0) * xx / b + (K(1.0) - b) / b);
261 for (r = 0; r < p; r++)
263 sum += POW(b / K(2.0), (R) r) * k(K(0.5) - b, r, param)
264 *
BasisPoly(p - 1, r, K(2.0) * xx / b - (K(1.0) - b) / b);
318 C
kubintkern(
const R x,
const C *Add,
const int Ad,
const R a)
347 return (-f0 * c1 * c3 * c4 / K(6.0) + f1 * c2 * c3 * c4 / K(2.0)
348 - f2 * c2 * c1 * c4 / K(2.0) + f3 * c2 * c1 * c3 / K(6.0));
352 static C
kubintkern1(
const R x,
const C *Add,
const int Ad,
const R a)
358 c = (x + a) * (R)(Ad) / K(2.0) / a;
376 return (-f0 * c1 * c3 * c4 / K(6.0) + f1 * c2 * c3 * c4 / K(2.0)
377 - f2 * c2 * c1 * c4 / K(2.0) + f3 * c2 * c1 * c3 / K(6.0));
381 static void quicksort(
int d,
int t, R *x, C *alpha,
int *permutation_x_alpha,
int N)
386 R pivot = x[(N / 2) * d + t];
395 while (x[lpos * d + t] < pivot)
397 while (x[rpos * d + t] > pivot)
401 for (k = 0; k < d; k++)
403 temp1 = x[lpos * d + k];
404 x[lpos * d + k] = x[rpos * d + k];
405 x[rpos * d + k] = temp1;
408 alpha[lpos] = alpha[rpos];
411 if (permutation_x_alpha)
413 temp_int = permutation_x_alpha[lpos];
414 permutation_x_alpha[lpos] = permutation_x_alpha[rpos];
415 permutation_x_alpha[rpos] = temp_int;
423 quicksort(d, t, x, alpha, permutation_x_alpha, rpos + 1);
425 quicksort(d, t, x + lpos * d, alpha + lpos, permutation_x_alpha ? permutation_x_alpha + lpos : NULL, N - lpos);
435 box_index = (
int *) NFFT(malloc)((size_t)(ths->box_count) *
sizeof(int));
436 for (t = 0; t < ths->box_count; t++)
439 for (l = 0; l < ths->
N_total; l++)
442 for (t = 0; t < ths->
d; t++)
444 val[t] = ths->
x[ths->
d * l + t] + K(0.25) - ths->
eps_B / K(2.0);
445 ind *= ths->box_count_per_dim;
446 ind += (int) (val[t] / ths->
eps_I);
451 ths->box_offset[0] = 0;
452 for (t = 1; t <= ths->box_count; t++)
454 ths->box_offset[t] = ths->box_offset[t - 1] + box_index[t - 1];
455 box_index[t - 1] = ths->box_offset[t - 1];
458 for (l = 0; l < ths->
N_total; l++)
461 for (t = 0; t < ths->
d; t++)
463 val[t] = ths->
x[ths->
d * l + t] + K(0.25) - ths->
eps_B / K(2.0);
464 ind *= ths->box_count_per_dim;
465 ind += (int) (val[t] / ths->
eps_I);
468 ths->box_alpha[box_index[ind]] = ths->
alpha[l];
470 for (t = 0; t < ths->
d; t++)
472 ths->box_x[ths->
d * box_index[ind] + t] = ths->
x[ths->
d * l + t];
476 NFFT(free)(box_index);
481 int end_lt,
const C *Add,
const int Ad,
int p, R a,
const kernel k,
482 const R *param,
const unsigned flags)
489 for (m = start; m < end_lt; m++)
498 for (l = 0; l < d; l++)
499 r += (y[l] - x[m * d + l]) * (y[l] - x[m * d + l]);
504 result += alpha[m] * k(r, 0, param);
508 result -= alpha[m] *
regkern1(k, r, p, param, a, K(1.0) / K(16.0));
515 result -= alpha[m] *
regkern(k, r, p, param, a, K(1.0) / K(16.0));
518 result -= alpha[m] *
kubintkern(r, Add, Ad, a);
519 #elif defined(NF_QUADR)
520 result -= alpha[m]*quadrintkern(r,Add,Ad,a);
521 #elif defined(NF_LIN)
522 result -= alpha[m]*linintkern(r,Add,Ad,a);
524 #error define interpolation method
537 int y_multiind[ths->
d];
538 int multiindex[ths->
d];
541 for (t = 0; t < ths->
d; t++)
543 y_multiind[t] = (int)(LRINT((y[t] + K(0.25) - ths->
eps_B / K(2.0)) / ths->
eps_I));
548 for (y_ind =
max_i(0, y_multiind[0] - 1);
549 y_ind < ths->box_count_per_dim && y_ind <= y_multiind[0] + 1; y_ind++)
552 ths->box_offset[y_ind], ths->box_offset[y_ind + 1], ths->
Add, ths->
Ad,
556 else if (ths->
d == 2)
558 for (multiindex[0] =
max_i(0, y_multiind[0] - 1);
559 multiindex[0] < ths->box_count_per_dim
560 && multiindex[0] <= y_multiind[0] + 1; multiindex[0]++)
561 for (multiindex[1] =
max_i(0, y_multiind[1] - 1);
562 multiindex[1] < ths->box_count_per_dim
563 && multiindex[1] <= y_multiind[1] + 1; multiindex[1]++)
565 y_ind = (ths->box_count_per_dim * multiindex[0]) + multiindex[1];
567 ths->box_offset[y_ind], ths->box_offset[y_ind + 1], ths->
Add,
571 else if (ths->
d == 3)
573 for (multiindex[0] =
max_i(0, y_multiind[0] - 1);
574 multiindex[0] < ths->box_count_per_dim
575 && multiindex[0] <= y_multiind[0] + 1; multiindex[0]++)
576 for (multiindex[1] =
max_i(0, y_multiind[1] - 1);
577 multiindex[1] < ths->box_count_per_dim
578 && multiindex[1] <= y_multiind[1] + 1; multiindex[1]++)
579 for (multiindex[2] =
max_i(0, y_multiind[2] - 1);
580 multiindex[2] < ths->box_count_per_dim
581 && multiindex[2] <= y_multiind[2] + 1; multiindex[2]++)
583 y_ind = ((ths->box_count_per_dim * multiindex[0]) + multiindex[1])
584 * ths->box_count_per_dim + multiindex[2];
586 ths->box_offset[y_ind], ths->box_offset[y_ind + 1], ths->
Add,
599 static void BuildTree(
int d,
int t, R *x, C *alpha,
int *permutation_x_alpha,
int N)
605 quicksort(d, t, x, alpha, permutation_x_alpha, N);
607 BuildTree(d, (t + 1) % d, x, alpha, permutation_x_alpha, m);
608 BuildTree(d, (t + 1) % d, x + (m + 1) * d, alpha + (m + 1), permutation_x_alpha ? permutation_x_alpha + (m + 1) : NULL, N - m - 1);
613 static C
SearchTree(
const int d,
const int t,
const R *x,
const C *alpha,
614 const R *xmin,
const R *xmax,
const int N,
const kernel k,
const R *param,
615 const int Ad,
const C *Add,
const int p,
const unsigned flags)
626 R Median = x[m * d + t];
627 R a = FABS(Max - Min) / 2;
633 return SearchTree(d, (t + 1) % d, x + (m + 1) * d, alpha + (m + 1), xmin,
634 xmax, N - m - 1, k, param, Ad, Add, p, flags);
635 else if (Max < Median)
636 return SearchTree(d, (t + 1) % d, x, alpha, xmin, xmax, m, k, param, Ad,
643 for (l = 0; l < d; l++)
645 if (x[m * d + l] > xmin[l] && x[m * d + l] < xmax[l])
653 r = xmin[0] + a - x[m];
658 for (l = 0; l < d; l++)
659 r += (xmin[l] + a - x[m * d + l]) * (xmin[l] + a - x[m * d + l]);
664 result += alpha[m] * k(r, 0, param);
668 result -= alpha[m] *
regkern1(k, r, p, param, a, K(1.0) / K(16.0));
675 result -= alpha[m] *
regkern(k, r, p, param, a, K(1.0) / K(16.0));
678 result -= alpha[m] *
kubintkern(r, Add, Ad, a);
679 #elif defined(NF_QUADR)
680 result -= alpha[m]*quadrintkern(r,Add,Ad,a);
681 #elif defined(NF_LIN)
682 result -= alpha[m]*linintkern(r,Add,Ad,a);
684 #error define interpolation method
689 result +=
SearchTree(d, (t + 1) % d, x + (m + 1) * d, alpha + (m + 1), xmin,
690 xmax, N - m - 1, k, param, Ad, Add, p, flags);
691 result +=
SearchTree(d, (t + 1) % d, x, alpha, xmin, xmax, m, k, param, Ad, Add,
698 static void fastsum_precompute_kernel(
fastsum_plan *ths)
717 #pragma omp parallel for default(shared) private(k)
719 for (k = -ths->
Ad / 2 - 2; k <= ths->Ad / 2 + 2; k++)
725 #pragma omp parallel for default(shared) private(k)
727 for (k = 0; k <= ths->
Ad + 2; k++)
741 for (t = 0; t < ths->
d; t++)
745 #pragma omp parallel
for default(shared)
private(j,k,t)
747 for (j = 0; j < n_total; j++)
750 ths->
b[j] =
regkern1(ths->
k, (R) - (j / (R)(ths->
n) - K(0.5)), ths->
p,
756 for (t = 0; t < ths->
d; t++)
758 ths->
b[j] += ((R) (k % (ths->
n)) / (R)(ths->
n) - K(0.5))
759 * ((R) (k % (ths->
n)) / (R)(ths->
n) - K(0.5));
767 for (t = 0; t < ths->
d; t++)
770 NFFT(fftshift_complex)(ths->
b, (int)(ths->
d), N);
771 FFTW(execute)(ths->fft_plan);
772 NFFT(fftshift_complex)(ths->
b, (int)(ths->
d), N);
780 unsigned flags,
int nn,
int p, R eps_I, R eps_B)
786 int nthreads = NFFT(get_num_threads)();
805 ths->
Ad = 4 * (ths->
p) * (ths->
p);
806 ths->
Add = (C *) NFFT(malloc)((size_t)(ths->
Ad + 5) * (
sizeof(C)));
838 ths->
Ad =
max_i(10, (
int)(LRINT(CEIL(K(1.4) / POW(delta, K(1.0) / K(4.0))))));
839 ths->
Add = (C *) NFFT(malloc)((size_t)(ths->
Ad + 3) * (
sizeof(C)));
840 #elif defined(NF_QUADR)
841 ths->
Ad = (int)(LRINT(CEIL(K(2.2)/POW(delta,K(1.0)/K(3.0)))));
842 ths->
Add = (C *)NFFT(malloc)((size_t)(ths->
Ad+3)*(
sizeof(C)));
843 #elif defined(NF_LIN)
844 ths->
Ad = (int)(LRINT(CEIL(K(1.7)/pow(delta,K(1.0)/K(2.0)))));
845 ths->
Add = (C *)NFFT(malloc)((size_t)(ths->
Ad+3)*(
sizeof(C)));
847 #error define NF_LIN or NF_QUADR or NF_KUB
852 ths->
Ad = 2 * (ths->
p) * (ths->
p);
853 ths->
Add = (C *) NFFT(malloc)((size_t)(ths->
Ad + 3) * (
sizeof(C)));
859 for (t = 0; t < d; t++)
866 for (t = 0; t < d; t++)
869 ths->
b = (C*) NFFT(malloc)((size_t)(n_total) *
sizeof(C));
870 ths->
f_hat = (C*) NFFT(malloc)((size_t)(n_total) *
sizeof(C));
872 #pragma omp critical (nfft_omp_critical_fftw_plan)
874 FFTW(plan_with_nthreads)(nthreads);
877 ths->fft_plan = FFTW(plan_dft)(d, N, ths->
b, ths->
b, FFTW_FORWARD,
884 fastsum_precompute_kernel(ths);
890 int N[ths->
d], n[ths->
d];
891 unsigned sort_flags_adjoint = 0U;
896 sort_flags_adjoint = NFFT_SORT_NODES | NFFT_OMP_BLOCKWISE_ADJOINT;
898 sort_flags_adjoint = NFFT_SORT_NODES;
904 ths->
x = (R *) NFFT(malloc)((size_t)(ths->
d * N_total) * (
sizeof(R)));
905 ths->
alpha = (C *) NFFT(malloc)((size_t)(N_total) * (
sizeof(C)));
908 for (t = 0; t < ths->
d; t++)
911 n[t] = nn_oversampled;
914 NFFT(init_guru)(&(ths->mv1), ths->
d, N, N_total, n, m,
918 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
925 if (ths->
flags & NEARFIELD_BOXES)
927 ths->box_count_per_dim = (int)(LRINT(FLOOR((K(0.5) - ths->
eps_B) / ths->
eps_I))) + 1;
929 for (t = 0; t < ths->
d; t++)
930 ths->box_count *= ths->box_count_per_dim;
932 ths->box_offset = (
int *) NFFT(malloc)((size_t)(ths->box_count + 1) *
sizeof(int));
934 ths->box_alpha = (C *) NFFT(malloc)((size_t)(ths->
N_total) * (
sizeof(C)));
936 ths->box_x = (R *) NFFT(malloc)((size_t)(ths->
d * ths->
N_total) *
sizeof(R));
940 if (ths->
flags & STORE_PERMUTATION_X_ALPHA)
943 for (
int i=0; i<ths->
N_total; i++)
952 int N[ths->
d], n[ths->
d];
953 unsigned sort_flags_trafo = 0U;
956 sort_flags_trafo = NFFT_SORT_NODES;
960 ths->
y = (R *) NFFT(malloc)((size_t)(ths->
d * M_total) * (
sizeof(R)));
961 ths->
f = (C *) NFFT(malloc)((size_t)(M_total) * (
sizeof(C)));
964 for (t = 0; t < ths->
d; t++)
967 n[t] = nn_oversampled;
970 NFFT(init_guru)(&(ths->mv2), ths->
d, N, M_total, n, m,
974 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
982 kernel k, R *param,
unsigned flags,
int nn,
int m,
int p, R eps_I, R eps_B)
993 NFFT(free)(ths->
alpha);
995 NFFT(finalize)(&(ths->mv1));
997 if (ths->
flags & NEARFIELD_BOXES)
999 NFFT(free)(ths->box_offset);
1000 NFFT(free)(ths->box_alpha);
1001 NFFT(free)(ths->box_x);
1005 if (ths->
flags & STORE_PERMUTATION_X_ALPHA)
1016 NFFT(finalize)(&(ths->mv2));
1023 NFFT(free)(ths->
Add);
1026 #pragma omp critical (nfft_omp_critical_fftw_plan)
1029 FFTW(destroy_plan)(ths->fft_plan);
1035 NFFT(free)(ths->
f_hat);
1054 #pragma omp parallel for default(shared) private(j,k,t,r)
1056 for (j = 0; j < ths->
M_total; j++)
1059 for (k = 0; k < ths->
N_total; k++)
1062 r = ths->
y[j] - ths->
x[k];
1066 for (t = 0; t < ths->
d; t++)
1067 r += (ths->
y[j * ths->
d + t] - ths->
x[k * ths->
d + t])
1068 * (ths->
y[j * ths->
d + t] - ths->
x[k * ths->
d + t]);
1090 if (ths->
flags & NEARFIELD_BOXES)
1116 NFFT(precompute_lin_psi)(&(ths->mv1));
1119 NFFT(precompute_psi)(&(ths->mv1));
1122 NFFT(precompute_full_psi)(&(ths->mv1));
1152 NFFT(precompute_lin_psi)(&(ths->mv2));
1155 NFFT(precompute_psi)(&(ths->mv2));
1158 NFFT(precompute_full_psi)(&(ths->mv2));
1189 NFFT(adjoint)(&(ths->mv1));
1200 #pragma omp parallel for default(shared) private(k)
1202 for (k = 0; k < ths->mv2.
N_total; k++)
1203 ths->mv2.
f_hat[k] = ths->
b[k] * ths->mv1.
f_hat[k];
1213 NFFT(trafo)(&(ths->mv2));
1224 #pragma omp parallel for default(shared) private(j,k,t)
1226 for (j = 0; j < ths->
M_total; j++)
1228 R ymin[ths->
d], ymax[ths->
d];
1230 if (ths->
flags & NEARFIELD_BOXES)
1232 ths->
f[j] = ths->mv2.
f[j] +
SearchBox(ths->
y + ths->
d * j, ths);
1236 for (t = 0; t < ths->
d; t++)
1238 ymin[t] = ths->
y[ths->
d * j + t] - ths->
eps_I;
1239 ymax[t] = ths->
y[ths->
d * j + t] + ths->
eps_I;
1241 ths->
f[j] = ths->mv2.
f[j]
int M_total
number of target knots
static int max_i(int a, int b)
max
C regkern(kernel k, R xx, int p, const R *param, R a, R b)
regularized kernel with K_I arbitrary and K_B smooth to zero
static R fak(int n)
factorial
int * permutation_x_alpha
permutation vector of source nodes if STORE_PERMUTATION_X_ALPHA is set
#define EXACT_NEARFIELD
Constant symbols.
static R BasisPoly(int m, int r, R xx)
basis polynomial for regularized kernel
R nfft_elapsed_seconds(ticks t1, ticks t0)
Return number of elapsed seconds between two time points.
Header file with predefined kernels for the fast summation algorithm.
R * kernel_param
parameters for kernel function
void fastsum_init_guru_source_nodes(fastsum_plan *ths, int N_total, int nn_oversampled, int m)
initialize source nodes dependent part of fast summation plan
C * f_hat
Fourier coefficients of nfft plans.
C one_over_x(R x, int der, const R *param)
K(x) = 1/x.
void fastsum_finalize_kernel(fastsum_plan *ths)
finalization of fastsum plan
static C SearchBox(R *y, fastsum_plan *ths)
box-based near field correction
static C calc_SearchBox(int d, R *y, R *x, C *alpha, int start, int end_lt, const C *Add, const int Ad, int p, R a, const kernel k, const R *param, const unsigned flags)
inner computation function for box-based near field correction
plan for fast summation algorithm
C * b
expansion coefficients
static R binom(int n, int m)
binomial coefficient
void fastsum_precompute_source_nodes(fastsum_plan *ths)
precomputation for fastsum
Header file for the fast NFFT-based summation algorithm.
C * alpha
source coefficients
unsigned flags
flags precomp.
void fastsum_trafo(fastsum_plan *ths)
fast NFFT-based summation
static C regkern1(kernel k, R xx, int p, const R *param, R a, R b)
regularized kernel with K_I arbitrary and K_B periodized (used in 1D)
void fastsum_precompute(fastsum_plan *ths)
precomputation for fastsum
int N_total
number of source knots
void fastsum_finalize_target_nodes(fastsum_plan *ths)
finalization of fastsum plan
static C kubintkern1(const R x, const C *Add, const int Ad, const R a)
cubic spline interpolation in near field with arbitrary kernels
R MEASURE_TIME_t[8]
Measured time for each step if MEASURE_TIME is set.
static C regkern3(kernel k, R xx, int p, const R *param, R a, R b)
regularized kernel for even kernels with K_I even and K_B mirrored
void fastsum_init_guru(fastsum_plan *ths, int d, int N_total, int M_total, kernel k, R *param, unsigned flags, int nn, int m, int p, R eps_I, R eps_B)
initialization of fastsum plan
void fastsum_precompute_target_nodes(fastsum_plan *ths)
precomputation for fastsum
static C SearchTree(const int d, const int t, const R *x, const C *alpha, const R *xmin, const R *xmax, const int N, const kernel k, const R *param, const int Ad, const C *Add, const int p, const unsigned flags)
fast search in tree of source knots for near field computation
void fastsum_finalize_source_nodes(fastsum_plan *ths)
finalization of fastsum plan
R * y
target knots in d-ball with radius 1/4-eps_b/2
C kubintkern(const R x, const C *Add, const int Ad, const R a)
linear spline interpolation in near field with even kernels
int n
FS__ - fast summation.
void fastsum_init_guru_kernel(fastsum_plan *ths, int d, kernel k, R *param, unsigned flags, int nn, int p, R eps_I, R eps_B)
initialize node independent part of fast summation plan
void fastsum_init_guru_target_nodes(fastsum_plan *ths, int M_total, int nn_oversampled, int m)
initialize target nodes dependent part of fast summation plan
Header file for the nfft3 library.
int p
degree of smoothness of regularization
static void BuildBox(fastsum_plan *ths)
initialize box-based search data structures
void fastsum_finalize(fastsum_plan *ths)
finalization of fastsum plan
void fastsum_exact(fastsum_plan *ths)
direct computation of sums
static void quicksort(int d, int t, R *x, C *alpha, int *permutation_x_alpha, int N)
quicksort algorithm for source knots and associated coefficients
static void BuildTree(int d, int t, R *x, C *alpha, int *permutation_x_alpha, int N)
recursive sort of source knots dimension by dimension to get tree structure
R * x
source knots in d-ball with radius 1/4-eps_b/2