44 #define X(name) NFFT(name)
47 static inline INT intprod(
const INT *vec,
const INT a,
const INT d)
52 for (t = 0; t < d; t++)
59 #define BASE(x) CEXP(x)
75 static inline void sort0(
const INT d,
const INT *n,
const INT m,
76 const INT local_x_num,
const R *local_x, INT *ar_x)
78 INT u_j[d], i, j, help, rhigh;
82 for (i = 0; i < local_x_num; i++)
86 for (j = 0; j < d; j++)
88 help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
89 u_j[j] = (help % n[j] + n[j]) % n[j];
91 ar_x[2 * i] += u_j[j];
93 ar_x[2 * i] *= n[j + 1];
97 for (j = 0, nprod = 1; j < d; j++)
100 rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
102 ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) *
sizeof(INT));
103 Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
105 for (i = 1; i < local_x_num; i++)
106 assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
119 static inline void sort(
const X(plan) *ths)
121 if (ths->flags & NFFT_SORT_NODES)
122 sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
145 void X(trafo_direct)(
const X(plan) *ths)
147 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
149 memset(f, 0, (
size_t)(ths->M_total) *
sizeof(C));
156 #pragma omp parallel for default(shared) private(j)
158 for (j = 0; j < ths->M_total; j++)
161 for (k_L = 0; k_L < ths->N_total; k_L++)
163 R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
164 f[j] += f_hat[k_L] * BASE(-II * omega);
173 #pragma omp parallel for default(shared) private(j)
175 for (j = 0; j < ths->M_total; j++)
177 R x[ths->d], omega, Omega[ths->d + 1];
178 INT t, t2, k_L, k[ths->d];
180 for (t = 0; t < ths->d; t++)
183 x[t] = K2PI * ths->x[j * ths->d + t];
184 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
186 omega = Omega[ths->d];
188 for (k_L = 0; k_L < ths->N_total; k_L++)
190 f[j] += f_hat[k_L] * BASE(-II * omega);
192 for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
197 for (t2 = t; t2 < ths->d; t2++)
198 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
200 omega = Omega[ths->d];
207 void X(adjoint_direct)(
const X(plan) *ths)
209 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
211 memset(f_hat, 0, (
size_t)(ths->N_total) *
sizeof(C));
218 #pragma omp parallel for default(shared) private(k_L)
219 for (k_L = 0; k_L < ths->N_total; k_L++)
222 for (j = 0; j < ths->M_total; j++)
224 R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
225 f_hat[k_L] += f[j] * BASE(II * omega);
230 for (j = 0; j < ths->M_total; j++)
233 for (k_L = 0; k_L < ths->N_total; k_L++)
235 R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
236 f_hat[k_L] += f[j] * BASE(II * omega);
246 #pragma omp parallel for default(shared) private(j, k_L)
247 for (k_L = 0; k_L < ths->N_total; k_L++)
249 INT k[ths->d], k_temp, t;
253 for (t = ths->d - 1; t >= 0; t--)
255 k[t] = k_temp % ths->N[t] - ths->N[t]/2;
259 for (j = 0; j < ths->M_total; j++)
262 for (t = 0; t < ths->d; t++)
263 omega += k[t] * K2PI * ths->x[j * ths->d + t];
264 f_hat[k_L] += f[j] * BASE(II * omega);
268 for (j = 0; j < ths->M_total; j++)
270 R x[ths->d], omega, Omega[ths->d+1];
271 INT t, t2, k[ths->d];
273 for (t = 0; t < ths->d; t++)
276 x[t] = K2PI * ths->x[j * ths->d + t];
277 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
279 omega = Omega[ths->d];
280 for (k_L = 0; k_L < ths->N_total; k_L++)
282 f_hat[k_L] += f[j] * BASE(II * omega);
284 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
289 for (t2 = t; t2 < ths->d; t2++)
290 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
292 omega = Omega[ths->d];
324 static inline void uo(
const X(plan) *ths,
const INT j, INT *up, INT *op,
327 const R xj = ths->x[j * ths->d + act_dim];
328 INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
330 (*up) = c - (ths->m);
331 (*op) = c + 1 + (ths->m);
334 static inline void uo2(INT *u, INT *o,
const R x,
const INT n,
const INT m)
336 INT c = LRINT(FLOOR(x * (R)(n)));
338 *u = (c - m + n) % n;
339 *o = (c + 1 + m + n) % n;
342 #define MACRO_D_compute_A \
344 g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
347 #define MACRO_D_compute_T \
349 f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
352 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
354 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
356 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
358 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2));
360 #define MACRO_init_k_ks \
362 for (t = ths->d-1; 0 <= t; t--) \
365 ks[t] = ths->N[t]/2; \
370 #define MACRO_update_c_phi_inv_k(which_one) \
372 for (t2 = t; t2 < ths->d; t2++) \
374 c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
375 ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
376 k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
380 #define MACRO_count_k_ks \
382 for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
385 ks[t]= ths->N[t]/2; \
388 kp[t]++; k[t]++; ks[t]++; \
389 if(kp[t] == ths->N[t]/2) \
391 k[t] = ths->n[t] - ths->N[t]/2; \
397 #define MACRO_D(which_one) \
398 static inline void D_serial_ ## which_one (X(plan) *ths) \
401 R c_phi_inv_k[ths->d+1]; \
407 INT k_plain[ths->d+1]; \
408 INT ks_plain[ths->d+1]; \
410 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
411 MACRO_D_init_result_ ## which_one; \
413 c_phi_inv_k[0] = K(1.0); \
419 if (ths->flags & PRE_PHI_HUT) \
421 for (k_L = 0; k_L < ths->N_total; k_L++) \
423 MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
424 MACRO_D_compute_ ## which_one; \
430 for (k_L = 0; k_L < ths->N_total; k_L++) \
432 MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
433 MACRO_D_compute_ ## which_one; \
440 static inline void D_openmp_A(
X(plan) *ths)
445 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
446 memset(g_hat, 0, ths->n_total *
sizeof(C));
450 #pragma omp parallel for default(shared) private(k_L)
451 for (k_L = 0; k_L < ths->N_total; k_L++)
456 R c_phi_inv_k_val = K(1.0);
458 INT ks_plain_val = 0;
462 for (t = ths->d-1; t >= 0; t--)
464 kp[t] = k_temp % ths->N[t];
465 if (kp[t] >= ths->N[t]/2)
466 k[t] = ths->n[t] - ths->N[t] + kp[t];
469 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
473 for (t = 0; t < ths->d; t++)
475 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
476 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
477 k_plain_val = k_plain_val*ths->n[t] + k[t];
480 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
485 #pragma omp parallel for default(shared) private(k_L)
486 for (k_L = 0; k_L < ths->N_total; k_L++)
491 R c_phi_inv_k_val = K(1.0);
493 INT ks_plain_val = 0;
497 for (t = ths->d-1; t >= 0; t--)
499 kp[t] = k_temp % ths->N[t];
500 if (kp[t] >= ths->N[t]/2)
501 k[t] = ths->n[t] - ths->N[t] + kp[t];
504 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
508 for (t = 0; t < ths->d; t++)
510 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
511 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
512 k_plain_val = k_plain_val*ths->n[t] + k[t];
515 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
525 static inline void D_A(
X(plan) *ths)
535 static void D_openmp_T(
X(plan) *ths)
540 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
541 memset(f_hat, 0, ths->N_total *
sizeof(C));
545 #pragma omp parallel for default(shared) private(k_L)
546 for (k_L = 0; k_L < ths->N_total; k_L++)
551 R c_phi_inv_k_val = K(1.0);
553 INT ks_plain_val = 0;
557 for (t = ths->d - 1; t >= 0; t--)
559 kp[t] = k_temp % ths->N[t];
560 if (kp[t] >= ths->N[t]/2)
561 k[t] = ths->n[t] - ths->N[t] + kp[t];
564 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
568 for (t = 0; t < ths->d; t++)
570 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
571 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
572 k_plain_val = k_plain_val*ths->n[t] + k[t];
575 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
580 #pragma omp parallel for default(shared) private(k_L)
581 for (k_L = 0; k_L < ths->N_total; k_L++)
586 R c_phi_inv_k_val = K(1.0);
588 INT ks_plain_val = 0;
592 for (t = ths->d-1; t >= 0; t--)
594 kp[t] = k_temp % ths->N[t];
595 if (kp[t] >= ths->N[t]/2)
596 k[t] = ths->n[t] - ths->N[t] + kp[t];
599 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
603 for (t = 0; t < ths->d; t++)
605 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
606 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
607 k_plain_val = k_plain_val*ths->n[t] + k[t];
610 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
620 static void D_T(
X(plan) *ths)
630 #define MACRO_B_init_result_A memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
631 #define MACRO_B_init_result_T memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
633 #define MACRO_B_PRE_FULL_PSI_compute_A \
635 (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
638 #define MACRO_B_PRE_FULL_PSI_compute_T \
640 g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
643 #define MACRO_B_compute_A \
645 (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
648 #define MACRO_B_compute_T \
650 g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
653 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
655 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
657 #define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \
658 - ((R)l[t2])/((R)ths->n[t2]), t2)
660 #define MACRO_init_uo_l_lj_t \
662 for (t = ths->d-1; t >= 0; t--) \
664 uo(ths,j,&u[t],&o[t],t); \
671 #define MACRO_update_phi_prod_ll_plain(which_one) { \
672 for (t2 = t; t2 < ths->d; t2++) \
674 phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \
675 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + (l[t2] + ths->n[t2]) % ths->n[t2]; \
679 #define MACRO_count_uo_l_lj_t \
681 for (t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
691 #define MACRO_B(which_one) \
692 static inline void B_serial_ ## which_one (X(plan) *ths) \
695 INT u[ths->d], o[ths->d]; \
701 INT ll_plain[ths->d+1]; \
702 R phi_prod[ths->d+1]; \
706 R fg_psi[ths->d][2*ths->m+2]; \
707 R fg_exp_l[ths->d][2*ths->m+2]; \
709 R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
712 INT ip_s = ths->K/(ths->m+2); \
714 f = (C*)ths->f; g = (C*)ths->g; \
716 MACRO_B_init_result_ ## which_one; \
718 if (ths->flags & PRE_FULL_PSI) \
720 for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
722 for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
724 MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
730 phi_prod[0] = K(1.0); \
733 for (t = 0, lprod = 1; t < ths->d; t++) \
734 lprod *= (2 * ths->m + 2); \
736 if (ths->flags & PRE_PSI) \
738 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
740 MACRO_init_uo_l_lj_t; \
742 for (l_L = 0; l_L < lprod; l_L++) \
744 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
746 MACRO_B_compute_ ## which_one; \
748 MACRO_count_uo_l_lj_t; \
754 if (ths->flags & PRE_FG_PSI) \
756 for(t2 = 0; t2 < ths->d; t2++) \
758 tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \
759 tmpEXP2sq = tmpEXP2*tmpEXP2; \
762 fg_exp_l[t2][0] = K(1.0); \
763 for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
765 tmp3 = tmp2*tmpEXP2; \
767 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \
770 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
772 MACRO_init_uo_l_lj_t; \
774 for (t2 = 0; t2 < ths->d; t2++) \
776 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
777 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
779 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
782 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
786 for (l_L= 0; l_L < lprod; l_L++) \
788 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
790 MACRO_B_compute_ ## which_one; \
792 MACRO_count_uo_l_lj_t; \
798 if (ths->flags & FG_PSI) \
800 for (t2 = 0; t2 < ths->d; t2++) \
802 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
803 tmpEXP2sq = tmpEXP2*tmpEXP2; \
806 fg_exp_l[t2][0] = K(1.0); \
807 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
809 tmp3 = tmp2*tmpEXP2; \
811 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
814 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
816 MACRO_init_uo_l_lj_t; \
818 for (t2 = 0; t2 < ths->d; t2++) \
820 fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\
822 tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
825 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
828 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
832 for (l_L = 0; l_L < lprod; l_L++) \
834 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
836 MACRO_B_compute_ ## which_one; \
838 MACRO_count_uo_l_lj_t; \
844 if (ths->flags & PRE_LIN_PSI) \
846 for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
848 MACRO_init_uo_l_lj_t; \
850 for (t2 = 0; t2 < ths->d; t2++) \
852 y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
853 * ((R)(ths->K))) / (R)(ths->m + 2); \
854 ip_u = LRINT(FLOOR(y[t2])); \
856 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
858 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
859 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
864 for (l_L = 0; l_L < lprod; l_L++) \
866 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
868 MACRO_B_compute_ ## which_one; \
870 MACRO_count_uo_l_lj_t; \
877 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
879 MACRO_init_uo_l_lj_t; \
881 for (l_L = 0; l_L < lprod; l_L++) \
883 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
885 MACRO_B_compute_ ## which_one; \
887 MACRO_count_uo_l_lj_t; \
897 static inline void B_openmp_A (
X(plan) *ths)
902 memset(ths->f, 0, ths->M_total *
sizeof(C));
904 for (k = 0, lprod = 1; k < ths->d; k++)
905 lprod *= (2*ths->m+2);
909 #pragma omp parallel for default(shared) private(k)
910 for (k = 0; k < ths->M_total; k++)
913 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
915 for (l = 0; l < lprod; l++)
916 ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
923 #pragma omp parallel for default(shared) private(k)
924 for (k = 0; k < ths->M_total; k++)
926 INT u[ths->d], o[ths->d];
931 INT ll_plain[ths->d+1];
932 R phi_prod[ths->d+1];
933 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
935 phi_prod[0] = K(1.0);
938 MACRO_init_uo_l_lj_t;
940 for (l_L = 0; l_L < lprod; l_L++)
942 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
944 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
946 MACRO_count_uo_l_lj_t;
955 R fg_exp_l[ths->d][2*ths->m+2];
957 for (t2 = 0; t2 < ths->d; t2++)
960 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
961 R tmpEXP2sq = tmpEXP2*tmpEXP2;
964 fg_exp_l[t2][0] = K(1.0);
965 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
969 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
973 #pragma omp parallel for default(shared) private(k,t,t2)
974 for (k = 0; k < ths->M_total; k++)
976 INT ll_plain[ths->d+1];
977 R phi_prod[ths->d+1];
978 INT u[ths->d], o[ths->d];
981 R fg_psi[ths->d][2*ths->m+2];
985 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
987 phi_prod[0] = K(1.0);
990 MACRO_init_uo_l_lj_t;
992 for (t2 = 0; t2 < ths->d; t2++)
994 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
995 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
997 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1000 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1004 for (l_L= 0; l_L < lprod; l_L++)
1006 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1008 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1010 MACRO_count_uo_l_lj_t;
1019 R fg_exp_l[ths->d][2*ths->m+2];
1023 for (t2 = 0; t2 < ths->d; t2++)
1026 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1027 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1030 fg_exp_l[t2][0] = K(1.0);
1031 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1033 tmp3 = tmp2*tmpEXP2;
1035 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1039 #pragma omp parallel for default(shared) private(k,t,t2)
1040 for (k = 0; k < ths->M_total; k++)
1042 INT ll_plain[ths->d+1];
1043 R phi_prod[ths->d+1];
1044 INT u[ths->d], o[ths->d];
1047 R fg_psi[ths->d][2*ths->m+2];
1051 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1053 phi_prod[0] = K(1.0);
1056 MACRO_init_uo_l_lj_t;
1058 for (t2 = 0; t2 < ths->d; t2++)
1060 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1062 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1065 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1068 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1072 for (l_L = 0; l_L < lprod; l_L++)
1074 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1076 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1078 MACRO_count_uo_l_lj_t;
1088 #pragma omp parallel for default(shared) private(k)
1089 for (k = 0; k<ths->M_total; k++)
1091 INT u[ths->d], o[ths->d];
1096 INT ll_plain[ths->d+1];
1097 R phi_prod[ths->d+1];
1099 R fg_psi[ths->d][2*ths->m+2];
1103 INT ip_s = ths->K/(ths->m+2);
1104 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1106 phi_prod[0] = K(1.0);
1109 MACRO_init_uo_l_lj_t;
1111 for (t2 = 0; t2 < ths->d; t2++)
1113 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1114 * ((R)ths->K))/(ths->m+2);
1115 ip_u = LRINT(FLOOR(y[t2]));
1117 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1119 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1120 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1125 for (l_L = 0; l_L < lprod; l_L++)
1127 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1129 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1131 MACRO_count_uo_l_lj_t;
1140 #pragma omp parallel for default(shared) private(k)
1141 for (k = 0; k < ths->M_total; k++)
1143 INT u[ths->d], o[ths->d];
1148 INT ll_plain[ths->d+1];
1149 R phi_prod[ths->d+1];
1150 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1152 phi_prod[0] = K(1.0);
1155 MACRO_init_uo_l_lj_t;
1157 for (l_L = 0; l_L < lprod; l_L++)
1159 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1161 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1163 MACRO_count_uo_l_lj_t;
1169 static void B_A(
X(plan) *ths)
1194 static inline INT index_x_binary_search(
const INT *ar_x,
const INT len,
const INT key)
1196 INT left = 0, right = len - 1;
1201 while (left < right - 1)
1203 INT i = (left + right) / 2;
1204 if (ar_x[2*i] >= key)
1206 else if (ar_x[2*i] < key)
1210 if (ar_x[2*left] < key && left != len-1)
1233 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
1234 INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b,
const INT d,
1235 const INT *n,
const INT m)
1237 const INT n0 = n[0];
1239 INT nthreads = omp_get_num_threads();
1240 INT nthreads_used = MIN(nthreads, n0);
1241 INT size_per_thread = n0 / nthreads_used;
1242 INT size_left = n0 - size_per_thread * nthreads_used;
1243 INT size_g[nthreads_used];
1244 INT offset_g[nthreads_used];
1245 INT my_id = omp_get_thread_num();
1246 INT n_prod_rest = 1;
1248 for (k = 1; k < d; k++)
1249 n_prod_rest *= n[k];
1258 if (my_id < nthreads_used)
1260 const INT m22 = 2 * m + 2;
1263 for (k = 0; k < nthreads_used; k++)
1266 offset_g[k] = offset_g[k-1] + size_g[k-1];
1267 size_g[k] = size_per_thread;
1275 *my_u0 = offset_g[my_id];
1276 *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1278 if (nthreads_used > 1)
1280 *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1281 *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1286 *max_u_a = n_prod_rest * n0 - 1;
1291 *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1292 *max_u_b = n_prod_rest * n0 - 1;
1296 if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1298 *max_u_a = *max_u_b;
1303 assert(*min_u_a <= *max_u_a);
1304 assert(*min_u_b <= *max_u_b);
1305 assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1319 static void nfft_adjoint_B_compute_full_psi(C *g,
const INT *psi_index_g,
1320 const R *psi,
const C *f,
const INT M,
const INT d,
const INT *n,
1321 const INT m,
const unsigned flags,
const INT *index_x)
1333 for(t = 0, lprod = 1; t < d; t++)
1337 lprod_m1 = lprod / (2 * m + 2);
1341 if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1343 #pragma omp parallel private(k)
1345 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1346 const INT *ar_x = index_x;
1347 INT n_prod_rest = 1;
1349 for (k = 1; k < d; k++)
1350 n_prod_rest *= n[k];
1352 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1356 k = index_x_binary_search(ar_x, M, min_u_a);
1358 assert(ar_x[2*k] >= min_u_a || k == M-1);
1360 assert(ar_x[2*k-2] < min_u_a);
1365 INT u_prod = ar_x[2*k];
1366 INT j = ar_x[2*k+1];
1368 if (u_prod < min_u_a || u_prod > max_u_a)
1371 for (l0 = 0; l0 < 2 * m + 2; l0++)
1373 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1375 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1378 for (lrest = 0; lrest < lprod_m1; lrest++)
1380 const INT l = l0 * lprod_m1 + lrest;
1381 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1391 k = index_x_binary_search(ar_x, M, min_u_b);
1393 assert(ar_x[2*k] >= min_u_b || k == M-1);
1395 assert(ar_x[2*k-2] < min_u_b);
1400 INT u_prod = ar_x[2*k];
1401 INT j = ar_x[2*k+1];
1403 if (u_prod < min_u_b || u_prod > max_u_b)
1406 for (l0 = 0; l0 < 2 * m + 2; l0++)
1408 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1410 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1412 for (lrest = 0; lrest < lprod_m1; lrest++)
1414 const INT l = l0 * lprod_m1 + lrest;
1415 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1428 #pragma omp parallel for default(shared) private(k)
1430 for (k = 0; k < M; k++)
1433 INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1435 for (l = 0; l < lprod; l++)
1438 C val = psi[j * lprod + l] * f[j];
1439 C *gref = g + psi_index_g[j * lprod + l];
1440 R *gref_real = (R*) gref;
1443 gref_real[0] += CREAL(val);
1446 gref_real[1] += CIMAG(val);
1448 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1459 static inline void B_openmp_T(
X(plan) *ths)
1464 memset(ths->g, 0, (
size_t)(ths->n_total) *
sizeof(C));
1466 for (k = 0, lprod = 1; k < ths->d; k++)
1467 lprod *= (2*ths->m+2);
1471 nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
1472 ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
1478 #pragma omp parallel for default(shared) private(k)
1479 for (k = 0; k < ths->M_total; k++)
1481 INT u[ths->d], o[ths->d];
1486 INT ll_plain[ths->d+1];
1487 R phi_prod[ths->d+1];
1488 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1490 phi_prod[0] = K(1.0);
1493 MACRO_init_uo_l_lj_t;
1495 for (l_L = 0; l_L < lprod; l_L++)
1501 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1503 lhs = ths->g + ll_plain[ths->d];
1505 val = phi_prod[ths->d] * ths->f[j];
1508 lhs_real[0] += CREAL(val);
1511 lhs_real[1] += CIMAG(val);
1513 MACRO_count_uo_l_lj_t;
1522 R fg_exp_l[ths->d][2*ths->m+2];
1523 for(t2 = 0; t2 < ths->d; t2++)
1526 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1527 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1530 fg_exp_l[t2][0] = K(1.0);
1531 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1533 tmp3 = tmp2*tmpEXP2;
1535 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1539 #pragma omp parallel for default(shared) private(k,t,t2)
1540 for (k = 0; k < ths->M_total; k++)
1542 INT ll_plain[ths->d+1];
1543 R phi_prod[ths->d+1];
1544 INT u[ths->d], o[ths->d];
1547 R fg_psi[ths->d][2*ths->m+2];
1551 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1553 phi_prod[0] = K(1.0);
1556 MACRO_init_uo_l_lj_t;
1558 for (t2 = 0; t2 < ths->d; t2++)
1560 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
1561 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
1563 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1566 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1570 for (l_L= 0; l_L < lprod; l_L++)
1576 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1578 lhs = ths->g + ll_plain[ths->d];
1580 val = phi_prod[ths->d] * ths->f[j];
1583 lhs_real[0] += CREAL(val);
1586 lhs_real[1] += CIMAG(val);
1588 MACRO_count_uo_l_lj_t;
1597 R fg_exp_l[ths->d][2*ths->m+2];
1601 for (t2 = 0; t2 < ths->d; t2++)
1604 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1605 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1608 fg_exp_l[t2][0] = K(1.0);
1609 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1611 tmp3 = tmp2*tmpEXP2;
1613 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1617 #pragma omp parallel for default(shared) private(k,t,t2)
1618 for (k = 0; k < ths->M_total; k++)
1620 INT ll_plain[ths->d+1];
1621 R phi_prod[ths->d+1];
1622 INT u[ths->d], o[ths->d];
1625 R fg_psi[ths->d][2*ths->m+2];
1629 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1631 phi_prod[0] = K(1.0);
1634 MACRO_init_uo_l_lj_t;
1636 for (t2 = 0; t2 < ths->d; t2++)
1638 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1640 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1643 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1646 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1650 for (l_L = 0; l_L < lprod; l_L++)
1656 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1658 lhs = ths->g + ll_plain[ths->d];
1660 val = phi_prod[ths->d] * ths->f[j];
1663 lhs_real[0] += CREAL(val);
1666 lhs_real[1] += CIMAG(val);
1668 MACRO_count_uo_l_lj_t;
1678 #pragma omp parallel for default(shared) private(k)
1679 for (k = 0; k<ths->M_total; k++)
1681 INT u[ths->d], o[ths->d];
1686 INT ll_plain[ths->d+1];
1687 R phi_prod[ths->d+1];
1689 R fg_psi[ths->d][2*ths->m+2];
1693 INT ip_s = ths->K/(ths->m+2);
1694 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1696 phi_prod[0] = K(1.0);
1699 MACRO_init_uo_l_lj_t;
1701 for (t2 = 0; t2 < ths->d; t2++)
1703 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1704 * ((R)ths->K))/(ths->m+2);
1705 ip_u = LRINT(FLOOR(y[t2]));
1707 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1709 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1710 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1715 for (l_L = 0; l_L < lprod; l_L++)
1721 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1723 lhs = ths->g + ll_plain[ths->d];
1725 val = phi_prod[ths->d] * ths->f[j];
1728 lhs_real[0] += CREAL(val);
1731 lhs_real[1] += CIMAG(val);
1733 MACRO_count_uo_l_lj_t;
1742 #pragma omp parallel for default(shared) private(k)
1743 for (k = 0; k < ths->M_total; k++)
1745 INT u[ths->d], o[ths->d];
1750 INT ll_plain[ths->d+1];
1751 R phi_prod[ths->d+1];
1752 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1754 phi_prod[0] = K(1.0);
1757 MACRO_init_uo_l_lj_t;
1759 for (l_L = 0; l_L < lprod; l_L++)
1765 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1767 lhs = ths->g + ll_plain[ths->d];
1769 val = phi_prod[ths->d] * ths->f[j];
1772 lhs_real[0] += CREAL(val);
1775 lhs_real[1] += CIMAG(val);
1777 MACRO_count_uo_l_lj_t;
1783 static void B_T(
X(plan) *ths)
1794 static void nfft_1d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
1796 const INT tmp2 = 2*m+2;
1798 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
1800 fg_exp_b0 = EXP(K(-1.0)/b);
1801 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
1802 fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
1804 for (l = 1; l < tmp2; l++)
1806 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
1807 fg_exp_b1 *= fg_exp_b0_sq;
1808 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
1813 static void nfft_trafo_1d_compute(C *fj,
const C *g,
const R *psij_const,
1814 const R *xj,
const INT n,
const INT m)
1821 uo2(&u, &o, *xj, n, m);
1825 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
1826 (*fj) += (*psij++) * (*gj++);
1830 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
1831 (*fj) += (*psij++) * (*gj++);
1832 for (l = 0, gj = g; l <= o; l++)
1833 (*fj) += (*psij++) * (*gj++);
1838 static void nfft_adjoint_1d_compute_serial(
const C *fj, C *g,
1839 const R *psij_const,
const R *xj,
const INT n,
const INT m)
1846 uo2(&u,&o,*xj, n, m);
1850 for (l = 0, gj = g+u; l <= 2*m+1; l++)
1851 (*gj++) += (*psij++) * (*fj);
1855 for (l = 0, gj = g+u; l < 2*m+1-o; l++)
1856 (*gj++) += (*psij++) * (*fj);
1857 for (l = 0, gj = g; l <= o; l++)
1858 (*gj++) += (*psij++) * (*fj);
1865 static void nfft_adjoint_1d_compute_omp_atomic(
const C f, C *g,
1866 const R *psij_const,
const R *xj,
const INT n,
const INT m)
1870 INT index_temp[2*m+2];
1872 uo2(&u,&o,*xj, n, m);
1874 for (l=0; l<=2*m+1; l++)
1875 index_temp[l] = (l+u)%n;
1877 for (l = 0, gj = g+u; l <= 2*m+1; l++)
1879 INT i = index_temp[l];
1881 R *lhs_real = (R*)lhs;
1882 C val = psij_const[l] * f;
1884 lhs_real[0] += CREAL(val);
1887 lhs_real[1] += CIMAG(val);
1908 static void nfft_adjoint_1d_compute_omp_blockwise(
const C f, C *g,
1909 const R *psij_const,
const R *xj,
const INT n,
const INT m,
1910 const INT my_u0,
const INT my_o0)
1914 uo2(&ar_u,&ar_o,*xj, n, m);
1918 INT u = MAX(my_u0,ar_u);
1919 INT o = MIN(my_o0,ar_o);
1920 INT offset_psij = u-ar_u;
1922 assert(offset_psij >= 0);
1923 assert(o-u <= 2*m+1);
1924 assert(offset_psij+o-u <= 2*m+1);
1927 for (l = 0; l <= o-u; l++)
1928 g[u+l] += psij_const[offset_psij+l] * f;
1932 INT u = MAX(my_u0,ar_u);
1934 INT offset_psij = u-ar_u;
1936 assert(offset_psij >= 0);
1937 assert(o-u <= 2*m+1);
1938 assert(offset_psij+o-u <= 2*m+1);
1941 for (l = 0; l <= o-u; l++)
1942 g[u+l] += psij_const[offset_psij+l] * f;
1945 o = MIN(my_o0,ar_o);
1946 offset_psij += my_u0-ar_u+n;
1951 assert(o-u <= 2*m+1);
1952 if (offset_psij+o-u > 2*m+1)
1954 fprintf(stderr,
"ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
1956 assert(offset_psij+o-u <= 2*m+1);
1959 for (l = 0; l <= o-u; l++)
1960 g[u+l] += psij_const[offset_psij+l] * f;
1965 static void nfft_trafo_1d_B(
X(plan) *ths)
1967 const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
1968 const C *g = (C*)ths->g;
1974 #pragma omp parallel for default(shared) private(k)
1976 for (k = 0; k < M; k++)
1979 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1981 for (l = 0; l < m2p2; l++)
1982 ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
1991 #pragma omp parallel for default(shared) private(k)
1993 for (k = 0; k < M; k++)
1995 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1996 nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
2007 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2010 #pragma omp parallel for default(shared) private(k)
2012 for (k = 0; k < M; k++)
2014 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2015 const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2016 R fg_psij2 = K(1.0);
2020 psij_const[0] = fg_psij0;
2022 for (l = 1; l < m2p2; l++)
2024 fg_psij2 *= fg_psij1;
2025 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2028 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2041 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2044 #pragma omp parallel for default(shared) private(k)
2046 for (k = 0; k < M; k++)
2048 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2050 R fg_psij0, fg_psij1, fg_psij2;
2053 uo(ths, (INT)j, &u, &o, (INT)0);
2054 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
2055 fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
2058 psij_const[0] = fg_psij0;
2060 for (l = 1; l < m2p2; l++)
2062 fg_psij2 *= fg_psij1;
2063 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2066 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2073 const INT K = ths->K, ip_s = K / (m + 2);
2079 #pragma omp parallel for default(shared) private(k)
2081 for (k = 0; k < M; k++)
2087 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2089 uo(ths, (INT)j, &u, &o, (INT)0);
2091 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2092 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2093 ip_w = ip_y - (R)(ip_u);
2095 for (l = 0; l < m2p2; l++)
2096 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2097 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2099 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2111 #pragma omp parallel for default(shared) private(k)
2113 for (k = 0; k < M; k++)
2117 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2119 uo(ths, (INT)j, &u, &o, (INT)0);
2121 for (l = 0; l < m2p2; l++)
2122 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
2124 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2131 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2133 assert(ar_x[2*k] >= min_u_a || k == M-1); \
2135 assert(ar_x[2*k-2] < min_u_a); \
2138 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
2142 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2144 assert(ar_x[2*k] >= min_u_b || k == M-1); \
2146 assert(ar_x[2*k-2] < min_u_b); \
2149 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
2152 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2154 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2155 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2158 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2160 R psij_const[2 * m + 2]; \
2162 R fg_psij0 = ths->psi[2 * j]; \
2163 R fg_psij1 = ths->psi[2 * j + 1]; \
2164 R fg_psij2 = K(1.0); \
2166 psij_const[0] = fg_psij0; \
2167 for (l = 1; l <= 2 * m + 1; l++) \
2169 fg_psij2 *= fg_psij1; \
2170 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2173 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2174 ths->x + j, n, m, my_u0, my_o0); \
2177 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
2179 R psij_const[2 * m + 2]; \
2180 R fg_psij0, fg_psij1, fg_psij2; \
2183 uo(ths, j, &u, &o, (INT)0); \
2184 fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/n,0)); \
2185 fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
2186 fg_psij2 = K(1.0); \
2187 psij_const[0] = fg_psij0; \
2188 for (l = 1; l <= 2 * m + 1; l++) \
2190 fg_psij2 *= fg_psij1; \
2191 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2194 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2195 ths->x + j, n, m, my_u0, my_o0); \
2198 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
2200 R psij_const[2 * m + 2]; \
2205 uo(ths, j, &u, &o, (INT)0); \
2207 ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
2208 ip_u = LRINT(FLOOR(ip_y)); \
2209 ip_w = ip_y - ip_u; \
2210 for (l = 0; l < 2 * m + 2; l++) \
2212 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
2213 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
2215 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2216 ths->x + j, n, m, my_u0, my_o0); \
2219 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
2221 R psij_const[2 * m + 2]; \
2224 uo(ths, j, &u, &o, (INT)0); \
2226 for (l = 0; l <= 2 * m + 1; l++) \
2227 psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/n,0)); \
2229 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2230 ths->x + j, n, m, my_u0, my_o0); \
2233 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
2235 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
2237 _Pragma("omp parallel private(k)") \
2239 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
2240 INT *ar_x = ths->index_x; \
2242 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
2243 &min_u_b, &max_u_b, 1, &n, m); \
2245 if (min_u_a != -1) \
2247 k = index_x_binary_search(ar_x, M, min_u_a); \
2249 MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2253 INT u_prod = ar_x[2*k]; \
2254 INT j = ar_x[2*k+1]; \
2256 if (u_prod < min_u_a || u_prod > max_u_a) \
2259 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2265 if (min_u_b != -1) \
2267 k = index_x_binary_search(ar_x, M, min_u_b); \
2269 MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2273 INT u_prod = ar_x[2*k]; \
2274 INT j = ar_x[2*k+1]; \
2276 if (u_prod < min_u_b || u_prod > max_u_b) \
2279 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2289 static void nfft_adjoint_1d_B(
X(plan) *ths)
2291 const INT n = ths->n[0], M = ths->M_total, m = ths->m;
2295 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
2299 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2300 (INT)1, ths->n, m, ths->flags, ths->index_x);
2307 MACRO_adjoint_1d_B_OMP_BLOCKWISE(
PRE_PSI)
2311 #pragma omp parallel for default(shared) private(k)
2313 for (k = 0; k < M; k++)
2315 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2317 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2319 nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2328 R fg_exp_l[2 * m + 2];
2330 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2338 #pragma omp parallel for default(shared) private(k)
2340 for (k = 0; k < M; k++)
2342 R psij_const[2 * m + 2];
2343 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2345 R fg_psij0 = ths->psi[2 * j];
2346 R fg_psij1 = ths->psi[2 * j + 1];
2347 R fg_psij2 = K(1.0);
2349 psij_const[0] = fg_psij0;
2350 for (l = 1; l <= 2 * m + 1; l++)
2352 fg_psij2 *= fg_psij1;
2353 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2357 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2359 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2368 R fg_exp_l[2 * m + 2];
2370 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2375 MACRO_adjoint_1d_B_OMP_BLOCKWISE(
FG_PSI)
2379 #pragma omp parallel for default(shared) private(k)
2381 for (k = 0; k < M; k++)
2384 R psij_const[2 * m + 2];
2385 R fg_psij0, fg_psij1, fg_psij2;
2386 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2388 uo(ths, j, &u, &o, (INT)0);
2389 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
2390 fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
2392 psij_const[0] = fg_psij0;
2393 for (l = 1; l <= 2 * m + 1; l++)
2395 fg_psij2 *= fg_psij1;
2396 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2400 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2402 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2411 const INT K = ths->K;
2412 const INT ip_s = K / (m + 2);
2421 #pragma omp parallel for default(shared) private(k)
2423 for (k = 0; k < M; k++)
2428 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2429 R psij_const[2 * m + 2];
2431 uo(ths, j, &u, &o, (INT)0);
2433 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2434 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2435 ip_w = ip_y - (R)(ip_u);
2436 for (l = 0; l < 2 * m + 2; l++)
2438 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2439 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2442 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2444 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2454 MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2458 #pragma omp parallel for default(shared) private(k)
2460 for (k = 0; k < M; k++)
2463 R psij_const[2 * m + 2];
2464 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2466 uo(ths, j, &u, &o, (INT)0);
2468 for (l = 0; l <= 2 * m + 1; l++)
2469 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
2472 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2474 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2479 void X(trafo_1d)(
X(plan) *ths)
2481 if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2483 X(trafo_direct)(ths);
2487 const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
2488 C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2490 ths->g_hat = ths->g1;
2494 C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2495 R *c_phi_inv1, *c_phi_inv2;
2501 #pragma omp parallel for default(shared) private(k)
2502 for (k = 0; k < ths->n_total; k++)
2503 ths->g_hat[k] = 0.0;
2506 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
2511 c_phi_inv1 = ths->c_phi_inv[0];
2512 c_phi_inv2 = &ths->c_phi_inv[0][N2];
2515 #pragma omp parallel for default(shared) private(k)
2517 for (k = 0; k < N2; k++)
2519 g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2520 g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2527 #pragma omp parallel for default(shared) private(k)
2529 for (k = 0; k < N2; k++)
2531 g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
2532 g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2538 FFTW(execute)(ths->my_fftw_plan1);
2542 nfft_trafo_1d_B(ths);
2547 void X(adjoint_1d)(
X(plan) *ths)
2549 if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2551 X(adjoint_direct)(ths);
2556 C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2557 R *c_phi_inv1, *c_phi_inv2;
2565 f_hat1=(C*)ths->f_hat;
2566 f_hat2=(C*)&ths->f_hat[N/2];
2567 g_hat1=(C*)&ths->g_hat[n-N/2];
2568 g_hat2=(C*)ths->g_hat;
2571 nfft_adjoint_1d_B(ths);
2575 FFTW(execute)(ths->my_fftw_plan2);
2582 c_phi_inv1=ths->c_phi_inv[0];
2583 c_phi_inv2=&ths->c_phi_inv[0][N/2];
2586 #pragma omp parallel for default(shared) private(k)
2588 for (k = 0; k < N/2; k++)
2590 f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2591 f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2599 #pragma omp parallel for default(shared) private(k)
2601 for (k = 0; k < N/2; k++)
2603 f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
2604 f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2613 static void nfft_2d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
2616 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2618 fg_exp_b0 = EXP(K(-1.0)/b);
2619 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2622 fg_exp_l[0] = K(1.0);
2623 for(l=1; l <= 2*m+1; l++)
2625 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2626 fg_exp_b1 *= fg_exp_b0_sq;
2627 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2631 static void nfft_trafo_2d_compute(C *fj,
const C *g,
const R *psij_const0,
2632 const R *psij_const1,
const R *xj0,
const R *xj1,
const INT n0,
2633 const INT n1,
const INT m)
2635 INT u0,o0,l0,u1,o1,l1;
2637 const R *psij0,*psij1;
2642 uo2(&u0,&o0,*xj0, n0, m);
2643 uo2(&u1,&o1,*xj1, n1, m);
2649 for(l0=0; l0<=2*m+1; l0++,psij0++)
2653 for(l1=0; l1<=2*m+1; l1++)
2654 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2657 for(l0=0; l0<=2*m+1; l0++,psij0++)
2661 for(l1=0; l1<2*m+1-o1; l1++)
2662 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2664 for(l1=0; l1<=o1; l1++)
2665 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2670 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2674 for(l1=0; l1<=2*m+1; l1++)
2675 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2677 for(l0=0; l0<=o0; l0++,psij0++)
2681 for(l1=0; l1<=2*m+1; l1++)
2682 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2687 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2691 for(l1=0; l1<2*m+1-o1; l1++)
2692 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2694 for(l1=0; l1<=o1; l1++)
2695 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2697 for(l0=0; l0<=o0; l0++,psij0++)
2701 for(l1=0; l1<2*m+1-o1; l1++)
2702 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2704 for(l1=0; l1<=o1; l1++)
2705 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2712 static void nfft_adjoint_2d_compute_omp_atomic(
const C f, C *g,
2713 const R *psij_const0,
const R *psij_const1,
const R *xj0,
2714 const R *xj1,
const INT n0,
const INT n1,
const INT m)
2716 INT u0,o0,l0,u1,o1,l1;
2717 const INT lprod = (2*m+2) * (2*m+2);
2719 INT index_temp0[2*m+2];
2720 INT index_temp1[2*m+2];
2722 uo2(&u0,&o0,*xj0, n0, m);
2723 uo2(&u1,&o1,*xj1, n1, m);
2725 for (l0=0; l0<=2*m+1; l0++)
2726 index_temp0[l0] = (u0+l0)%n0;
2728 for (l1=0; l1<=2*m+1; l1++)
2729 index_temp1[l1] = (u1+l1)%n1;
2731 for(l0=0; l0<=2*m+1; l0++)
2733 for(l1=0; l1<=2*m+1; l1++)
2735 INT i = index_temp0[l0] * n1 + index_temp1[l1];
2737 R *lhs_real = (R*)lhs;
2738 C val = psij_const0[l0] * psij_const1[l1] * f;
2741 lhs_real[0] += CREAL(val);
2744 lhs_real[1] += CIMAG(val);
2769 static void nfft_adjoint_2d_compute_omp_blockwise(
const C f, C *g,
2770 const R *psij_const0,
const R *psij_const1,
const R *xj0,
2771 const R *xj1,
const INT n0,
const INT n1,
const INT m,
2772 const INT my_u0,
const INT my_o0)
2774 INT ar_u0,ar_o0,l0,u1,o1,l1;
2775 const INT lprod = (2*m+2) * (2*m+2);
2776 INT index_temp1[2*m+2];
2778 uo2(&ar_u0,&ar_o0,*xj0, n0, m);
2779 uo2(&u1,&o1,*xj1, n1, m);
2781 for (l1 = 0; l1 <= 2*m+1; l1++)
2782 index_temp1[l1] = (u1+l1)%n1;
2786 INT u0 = MAX(my_u0,ar_u0);
2787 INT o0 = MIN(my_o0,ar_o0);
2788 INT offset_psij = u0-ar_u0;
2790 assert(offset_psij >= 0);
2791 assert(o0-u0 <= 2*m+1);
2792 assert(offset_psij+o0-u0 <= 2*m+1);
2795 for (l0 = 0; l0 <= o0-u0; l0++)
2797 INT i0 = (u0+l0) * n1;
2798 const C val0 = psij_const0[offset_psij+l0];
2800 for(l1=0; l1<=2*m+1; l1++)
2801 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2806 INT u0 = MAX(my_u0,ar_u0);
2808 INT offset_psij = u0-ar_u0;
2810 assert(offset_psij >= 0);
2811 assert(o0-u0 <= 2*m+1);
2812 assert(offset_psij+o0-u0 <= 2*m+1);
2815 for (l0 = 0; l0 <= o0-u0; l0++)
2817 INT i0 = (u0+l0) * n1;
2818 const C val0 = psij_const0[offset_psij+l0];
2820 for(l1=0; l1<=2*m+1; l1++)
2821 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2825 o0 = MIN(my_o0,ar_o0);
2826 offset_psij += my_u0-ar_u0+n0;
2831 assert(o0-u0 <= 2*m+1);
2832 assert(offset_psij+o0-u0 <= 2*m+1);
2836 for (l0 = 0; l0 <= o0-u0; l0++)
2838 INT i0 = (u0+l0) * n1;
2839 const C val0 = psij_const0[offset_psij+l0];
2841 for(l1=0; l1<=2*m+1; l1++)
2842 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2849 static void nfft_adjoint_2d_compute_serial(
const C *fj, C *g,
2850 const R *psij_const0,
const R *psij_const1,
const R *xj0,
2851 const R *xj1,
const INT n0,
const INT n1,
const INT m)
2853 INT u0,o0,l0,u1,o1,l1;
2855 const R *psij0,*psij1;
2860 uo2(&u0,&o0,*xj0, n0, m);
2861 uo2(&u1,&o1,*xj1, n1, m);
2865 for(l0=0; l0<=2*m+1; l0++,psij0++)
2869 for(l1=0; l1<=2*m+1; l1++)
2870 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2873 for(l0=0; l0<=2*m+1; l0++,psij0++)
2877 for(l1=0; l1<2*m+1-o1; l1++)
2878 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2880 for(l1=0; l1<=o1; l1++)
2881 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2886 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2890 for(l1=0; l1<=2*m+1; l1++)
2891 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2893 for(l0=0; l0<=o0; l0++,psij0++)
2897 for(l1=0; l1<=2*m+1; l1++)
2898 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2903 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2907 for(l1=0; l1<2*m+1-o1; l1++)
2908 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2910 for(l1=0; l1<=o1; l1++)
2911 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2913 for(l0=0; l0<=o0; l0++,psij0++)
2917 for(l1=0; l1<2*m+1-o1; l1++)
2918 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2920 for(l1=0; l1<=o1; l1++)
2921 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2927 static void nfft_trafo_2d_B(
X(plan) *ths)
2929 const C *g = (C*)ths->g;
2930 const INT n0 = ths->n[0];
2931 const INT n1 = ths->n[1];
2932 const INT M = ths->M_total;
2933 const INT m = ths->m;
2939 const INT lprod = (2*m+2) * (2*m+2);
2941 #pragma omp parallel for default(shared) private(k)
2943 for (k = 0; k < M; k++)
2946 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2948 for (l = 0; l < lprod; l++)
2949 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
2957 #pragma omp parallel for default(shared) private(k)
2959 for (k = 0; k < M; k++)
2961 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2962 nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2970 R fg_exp_l[2*(2*m+2)];
2972 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2973 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
2976 #pragma omp parallel for default(shared) private(k)
2978 for (k = 0; k < M; k++)
2980 R psij_const[2*(2*m+2)];
2981 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2983 R fg_psij0 = ths->psi[2*j*2];
2984 R fg_psij1 = ths->psi[2*j*2+1];
2985 R fg_psij2 = K(1.0);
2987 psij_const[0] = fg_psij0;
2988 for (l = 1; l <= 2*m+1; l++)
2990 fg_psij2 *= fg_psij1;
2991 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
2994 fg_psij0 = ths->psi[2*(j*2+1)];
2995 fg_psij1 = ths->psi[2*(j*2+1)+1];
2997 psij_const[2*m+2] = fg_psij0;
2998 for (l = 1; l <= 2*m+1; l++)
3000 fg_psij2 *= fg_psij1;
3001 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3004 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3012 R fg_exp_l[2*(2*m+2)];
3014 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3015 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3020 #pragma omp parallel for default(shared) private(k)
3022 for (k = 0; k < M; k++)
3025 R fg_psij0, fg_psij1, fg_psij2;
3026 R psij_const[2*(2*m+2)];
3027 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3029 uo(ths, j, &u, &o, (INT)0);
3030 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
3031 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3033 psij_const[0] = fg_psij0;
3034 for (l = 1; l <= 2*m+1; l++)
3036 fg_psij2 *= fg_psij1;
3037 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3040 uo(ths,j,&u,&o, (INT)1);
3041 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3042 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3044 psij_const[2*m+2] = fg_psij0;
3045 for(l=1; l<=2*m+1; l++)
3047 fg_psij2 *= fg_psij1;
3048 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3051 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3059 const INT K = ths->K, ip_s = K / (m + 2);
3064 #pragma omp parallel for default(shared) private(k)
3066 for (k = 0; k < M; k++)
3071 R psij_const[2*(2*m+2)];
3072 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3074 uo(ths,j,&u,&o,(INT)0);
3075 ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
3076 ip_u = (INT)LRINT(FLOOR(ip_y));
3077 ip_w = ip_y - (R)(ip_u);
3078 for (l = 0; l < 2*m+2; l++)
3079 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3081 uo(ths,j,&u,&o,(INT)1);
3082 ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
3083 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3084 ip_w = ip_y - (R)(ip_u);
3085 for (l = 0; l < 2*m+2; l++)
3086 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3088 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3098 #pragma omp parallel for default(shared) private(k)
3100 for (k = 0; k < M; k++)
3102 R psij_const[2*(2*m+2)];
3104 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3106 uo(ths,j,&u,&o,(INT)0);
3107 for (l = 0; l <= 2*m+1; l++)
3108 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3110 uo(ths,j,&u,&o,(INT)1);
3111 for (l = 0; l <= 2*m+1; l++)
3112 psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
3114 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3119 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3121 assert(ar_x[2*k] >= min_u_a || k == M-1); \
3123 assert(ar_x[2*k-2] < min_u_a); \
3126 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
3130 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3132 assert(ar_x[2*k] >= min_u_b || k == M-1); \
3134 assert(ar_x[2*k-2] < min_u_b); \
3137 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
3140 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
3141 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3142 ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
3143 ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
3145 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
3147 R psij_const[2*(2*m+2)]; \
3149 R fg_psij0 = ths->psi[2*j*2]; \
3150 R fg_psij1 = ths->psi[2*j*2+1]; \
3151 R fg_psij2 = K(1.0); \
3153 psij_const[0] = fg_psij0; \
3154 for(l=1; l<=2*m+1; l++) \
3156 fg_psij2 *= fg_psij1; \
3157 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3160 fg_psij0 = ths->psi[2*(j*2+1)]; \
3161 fg_psij1 = ths->psi[2*(j*2+1)+1]; \
3162 fg_psij2 = K(1.0); \
3163 psij_const[2*m+2] = fg_psij0; \
3164 for(l=1; l<=2*m+1; l++) \
3166 fg_psij2 *= fg_psij1; \
3167 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3170 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3171 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3172 n0, n1, m, my_u0, my_o0); \
3175 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
3177 R psij_const[2*(2*m+2)]; \
3178 R fg_psij0, fg_psij1, fg_psij2; \
3181 uo(ths,j,&u,&o,(INT)0); \
3182 fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/n0,0)); \
3183 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
3184 fg_psij2 = K(1.0); \
3185 psij_const[0] = fg_psij0; \
3186 for(l=1; l<=2*m+1; l++) \
3188 fg_psij2 *= fg_psij1; \
3189 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3192 uo(ths,j,&u,&o,(INT)1); \
3193 fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/n1,1)); \
3194 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
3195 fg_psij2 = K(1.0); \
3196 psij_const[2*m+2] = fg_psij0; \
3197 for(l=1; l<=2*m+1; l++) \
3199 fg_psij2 *= fg_psij1; \
3200 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3203 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3204 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3205 n0, n1, m, my_u0, my_o0); \
3208 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
3210 R psij_const[2*(2*m+2)]; \
3215 uo(ths,j,&u,&o,(INT)0); \
3216 ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
3217 ip_u = LRINT(FLOOR(ip_y)); \
3219 for(l=0; l < 2*m+2; l++) \
3220 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3221 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
3223 uo(ths,j,&u,&o,(INT)1); \
3224 ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
3225 ip_u = LRINT(FLOOR(ip_y)); \
3227 for(l=0; l < 2*m+2; l++) \
3228 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3229 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
3231 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3232 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3233 n0, n1, m, my_u0, my_o0); \
3236 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
3238 R psij_const[2*(2*m+2)]; \
3241 uo(ths,j,&u,&o,(INT)0); \
3242 for(l=0;l<=2*m+1;l++) \
3243 psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/n0,0)); \
3245 uo(ths,j,&u,&o,(INT)1); \
3246 for(l=0;l<=2*m+1;l++) \
3247 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
3249 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3250 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3251 n0, n1, m, my_u0, my_o0); \
3254 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
3256 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
3258 _Pragma("omp parallel private(k)") \
3260 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
3261 INT *ar_x = ths->index_x; \
3263 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
3264 &min_u_b, &max_u_b, 2, ths->n, m); \
3266 if (min_u_a != -1) \
3268 k = index_x_binary_search(ar_x, M, min_u_a); \
3270 MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3274 INT u_prod = ar_x[2*k]; \
3275 INT j = ar_x[2*k+1]; \
3277 if (u_prod < min_u_a || u_prod > max_u_a) \
3280 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3286 if (min_u_b != -1) \
3288 INT k = index_x_binary_search(ar_x, M, min_u_b); \
3290 MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3294 INT u_prod = ar_x[2*k]; \
3295 INT j = ar_x[2*k+1]; \
3297 if (u_prod < min_u_b || u_prod > max_u_b) \
3300 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3311 static void nfft_adjoint_2d_B(
X(plan) *ths)
3313 const INT n0 = ths->n[0];
3314 const INT n1 = ths->n[1];
3315 const INT M = ths->M_total;
3316 const INT m = ths->m;
3320 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
3324 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3325 (INT)2, ths->n, m, ths->flags, ths->index_x);
3332 MACRO_adjoint_2d_B_OMP_BLOCKWISE(
PRE_PSI)
3336 #pragma omp parallel for default(shared) private(k)
3338 for (k = 0; k < M; k++)
3340 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3342 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3344 nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3352 R fg_exp_l[2*(2*m+2)];
3354 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3355 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3363 #pragma omp parallel for default(shared) private(k)
3365 for (k = 0; k < M; k++)
3367 R psij_const[2*(2*m+2)];
3368 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3370 R fg_psij0 = ths->psi[2*j*2];
3371 R fg_psij1 = ths->psi[2*j*2+1];
3372 R fg_psij2 = K(1.0);
3374 psij_const[0] = fg_psij0;
3375 for(l=1; l<=2*m+1; l++)
3377 fg_psij2 *= fg_psij1;
3378 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3381 fg_psij0 = ths->psi[2*(j*2+1)];
3382 fg_psij1 = ths->psi[2*(j*2+1)+1];
3384 psij_const[2*m+2] = fg_psij0;
3385 for(l=1; l<=2*m+1; l++)
3387 fg_psij2 *= fg_psij1;
3388 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3392 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3394 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3403 R fg_exp_l[2*(2*m+2)];
3405 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3406 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3411 MACRO_adjoint_2d_B_OMP_BLOCKWISE(
FG_PSI)
3415 #pragma omp parallel for default(shared) private(k)
3417 for (k = 0; k < M; k++)
3420 R fg_psij0, fg_psij1, fg_psij2;
3421 R psij_const[2*(2*m+2)];
3422 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3424 uo(ths,j,&u,&o,(INT)0);
3425 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
3426 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3428 psij_const[0] = fg_psij0;
3429 for(l=1; l<=2*m+1; l++)
3431 fg_psij2 *= fg_psij1;
3432 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3435 uo(ths,j,&u,&o,(INT)1);
3436 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3437 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3439 psij_const[2*m+2] = fg_psij0;
3440 for(l=1; l<=2*m+1; l++)
3442 fg_psij2 *= fg_psij1;
3443 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3447 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3449 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3458 const INT K = ths->K;
3459 const INT ip_s = K / (m + 2);
3468 #pragma omp parallel for default(shared) private(k)
3470 for (k = 0; k < M; k++)
3475 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3476 R psij_const[2*(2*m+2)];
3478 uo(ths,j,&u,&o,(INT)0);
3479 ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
3480 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3481 ip_w = ip_y - (R)(ip_u);
3482 for(l=0; l < 2*m+2; l++)
3483 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3484 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3486 uo(ths,j,&u,&o,(INT)1);
3487 ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
3488 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3489 ip_w = ip_y - (R)(ip_u);
3490 for(l=0; l < 2*m+2; l++)
3491 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3492 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3495 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3497 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3507 MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3511 #pragma omp parallel for default(shared) private(k)
3513 for (k = 0; k < M; k++)
3516 R psij_const[2*(2*m+2)];
3517 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3519 uo(ths,j,&u,&o,(INT)0);
3520 for(l=0;l<=2*m+1;l++)
3521 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3523 uo(ths,j,&u,&o,(INT)1);
3524 for(l=0;l<=2*m+1;l++)
3525 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
3528 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3530 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3536 void X(trafo_2d)(
X(plan) *ths)
3538 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3540 X(trafo_direct)(ths);
3544 INT k0,k1,n0,n1,N0,N1;
3546 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3547 R ck01, ck02, ck11, ck12;
3548 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3558 f_hat=(C*)ths->f_hat;
3559 g_hat=(C*)ths->g_hat;
3563 #pragma omp parallel for default(shared) private(k0)
3564 for (k0 = 0; k0 < ths->n_total; k0++)
3565 ths->g_hat[k0] = 0.0;
3567 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
3569 if(ths->flags & PRE_PHI_HUT)
3571 c_phi_inv01=ths->c_phi_inv[0];
3572 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3575 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3577 for(k0=0;k0<N0/2;k0++)
3579 ck01=c_phi_inv01[k0];
3580 ck02=c_phi_inv02[k0];
3582 c_phi_inv11=ths->c_phi_inv[1];
3583 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3585 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3586 f_hat11=f_hat + k0*N1;
3587 g_hat21=g_hat + k0*n1+n1-(N1/2);
3588 f_hat21=f_hat + ((N0/2)+k0)*N1;
3589 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3590 f_hat12=f_hat + k0*N1+(N1/2);
3591 g_hat22=g_hat + k0*n1;
3592 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3594 for(k1=0;k1<N1/2;k1++)
3596 ck11=c_phi_inv11[k1];
3597 ck12=c_phi_inv12[k1];
3599 g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3600 g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3601 g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3602 g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3608 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3610 for(k0=0;k0<N0/2;k0++)
3612 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3613 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3614 for(k1=0;k1<N1/2;k1++)
3616 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3617 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3618 g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3619 g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3620 g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3621 g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3628 FFTW(execute)(ths->my_fftw_plan1);
3632 nfft_trafo_2d_B(ths);
3636 void X(adjoint_2d)(
X(plan) *ths)
3638 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3640 X(adjoint_direct)(ths);
3644 INT k0,k1,n0,n1,N0,N1;
3646 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3647 R ck01, ck02, ck11, ck12;
3648 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3658 f_hat=(C*)ths->f_hat;
3659 g_hat=(C*)ths->g_hat;
3662 nfft_adjoint_2d_B(ths);
3666 FFTW(execute)(ths->my_fftw_plan2);
3670 if(ths->flags & PRE_PHI_HUT)
3672 c_phi_inv01=ths->c_phi_inv[0];
3673 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3676 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3678 for(k0=0;k0<N0/2;k0++)
3680 ck01=c_phi_inv01[k0];
3681 ck02=c_phi_inv02[k0];
3683 c_phi_inv11=ths->c_phi_inv[1];
3684 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3686 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3687 f_hat11=f_hat + k0*N1;
3688 g_hat21=g_hat + k0*n1+n1-(N1/2);
3689 f_hat21=f_hat + ((N0/2)+k0)*N1;
3690 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3691 f_hat12=f_hat + k0*N1+(N1/2);
3692 g_hat22=g_hat + k0*n1;
3693 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3695 for(k1=0;k1<N1/2;k1++)
3697 ck11=c_phi_inv11[k1];
3698 ck12=c_phi_inv12[k1];
3700 f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3701 f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3702 f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3703 f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3709 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3711 for(k0=0;k0<N0/2;k0++)
3713 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3714 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3715 for(k1=0;k1<N1/2;k1++)
3717 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3718 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3719 f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3720 f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3721 f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3722 f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3730 static void nfft_3d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
3733 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
3735 fg_exp_b0 = EXP(-K(1.0) / b);
3736 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
3739 fg_exp_l[0] = K(1.0);
3740 for(l=1; l <= 2*m+1; l++)
3742 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
3743 fg_exp_b1 *= fg_exp_b0_sq;
3744 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
3748 static void nfft_trafo_3d_compute(C *fj,
const C *g,
const R *psij_const0,
3749 const R *psij_const1,
const R *psij_const2,
const R *xj0,
const R *xj1,
3750 const R *xj2,
const INT n0,
const INT n1,
const INT n2,
const INT m)
3752 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
3754 const R *psij0, *psij1, *psij2;
3756 psij0 = psij_const0;
3757 psij1 = psij_const1;
3758 psij2 = psij_const2;
3760 uo2(&u0, &o0, *xj0, n0, m);
3761 uo2(&u1, &o1, *xj1, n1, m);
3762 uo2(&u2, &o2, *xj2, n2, m);
3769 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3771 psij1 = psij_const1;
3772 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3774 psij2 = psij_const2;
3775 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3776 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3777 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3782 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3784 psij1 = psij_const1;
3785 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3787 psij2 = psij_const2;
3788 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3789 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3790 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3791 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3792 for (l2 = 0; l2 <= o2; l2++)
3793 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3798 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3800 psij1 = psij_const1;
3801 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3803 psij2 = psij_const2;
3804 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3805 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3806 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3808 for (l1 = 0; l1 <= o1; l1++, psij1++)
3810 psij2 = psij_const2;
3811 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3812 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3813 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3818 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3820 psij1 = psij_const1;
3821 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3823 psij2 = psij_const2;
3824 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3825 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3826 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3827 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3828 for (l2 = 0; l2 <= o2; l2++)
3829 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3831 for (l1 = 0; l1 <= o1; l1++, psij1++)
3833 psij2 = psij_const2;
3834 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3835 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3836 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3837 gj = g + ((u0 + l0) * n1 + l1) * n2;
3838 for (l2 = 0; l2 <= o2; l2++)
3839 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3847 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3849 psij1 = psij_const1;
3850 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3852 psij2 = psij_const2;
3853 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3854 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3855 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3859 for (l0 = 0; l0 <= o0; l0++, psij0++)
3861 psij1 = psij_const1;
3862 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3864 psij2 = psij_const2;
3865 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3866 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3867 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3872 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3874 psij1 = psij_const1;
3875 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3877 psij2 = psij_const2;
3878 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3879 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3880 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3881 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3882 for (l2 = 0; l2 <= o2; l2++)
3883 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3887 for (l0 = 0; l0 <= o0; l0++, psij0++)
3889 psij1 = psij_const1;
3890 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3892 psij2 = psij_const2;
3893 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3894 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3895 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3896 gj = g + (l0 * n1 + (u1 + l1)) * n2;
3897 for (l2 = 0; l2 <= o2; l2++)
3898 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3905 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3907 psij1 = psij_const1;
3908 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3910 psij2 = psij_const2;
3911 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3912 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3913 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3915 for (l1 = 0; l1 <= o1; l1++, psij1++)
3917 psij2 = psij_const2;
3918 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3919 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3920 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3923 for (l0 = 0; l0 <= o0; l0++, psij0++)
3925 psij1 = psij_const1;
3926 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3928 psij2 = psij_const2;
3929 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3930 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3931 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3933 for (l1 = 0; l1 <= o1; l1++, psij1++)
3935 psij2 = psij_const2;
3936 gj = g + (l0 * n1 + l1) * n2 + u2;
3937 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3938 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3943 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3945 psij1 = psij_const1;
3946 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3948 psij2 = psij_const2;
3949 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3950 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3951 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3952 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3953 for (l2 = 0; l2 <= o2; l2++)
3954 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3956 for (l1 = 0; l1 <= o1; l1++, psij1++)
3958 psij2 = psij_const2;
3959 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3960 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3961 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3962 gj = g + ((u0 + l0) * n1 + l1) * n2;
3963 for (l2 = 0; l2 <= o2; l2++)
3964 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3968 for (l0 = 0; l0 <= o0; l0++, psij0++)
3970 psij1 = psij_const1;
3971 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3973 psij2 = psij_const2;
3974 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3975 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3976 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3977 gj = g + (l0 * n1 + (u1 + l1)) * n2;
3978 for (l2 = 0; l2 <= o2; l2++)
3979 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3981 for (l1 = 0; l1 <= o1; l1++, psij1++)
3983 psij2 = psij_const2;
3984 gj = g + (l0 * n1 + l1) * n2 + u2;
3985 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3986 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3987 gj = g + (l0 * n1 + l1) * n2;
3988 for (l2 = 0; l2 <= o2; l2++)
3989 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4017 static void nfft_adjoint_3d_compute_omp_blockwise(
const C f, C *g,
4018 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
4019 const R *xj0,
const R *xj1,
const R *xj2,
4020 const INT n0,
const INT n1,
const INT n2,
const INT m,
4021 const INT my_u0,
const INT my_o0)
4023 INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4024 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4026 INT index_temp1[2*m+2];
4027 INT index_temp2[2*m+2];
4029 uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4030 uo2(&u1,&o1,*xj1, n1, m);
4031 uo2(&u2,&o2,*xj2, n2, m);
4033 for (l1=0; l1<=2*m+1; l1++)
4034 index_temp1[l1] = (u1+l1)%n1;
4036 for (l2=0; l2<=2*m+1; l2++)
4037 index_temp2[l2] = (u2+l2)%n2;
4041 INT u0 = MAX(my_u0,ar_u0);
4042 INT o0 = MIN(my_o0,ar_o0);
4043 INT offset_psij = u0-ar_u0;
4045 assert(offset_psij >= 0);
4046 assert(o0-u0 <= 2*m+1);
4047 assert(offset_psij+o0-u0 <= 2*m+1);
4050 for (l0 = 0; l0 <= o0-u0; l0++)
4052 const INT i0 = (u0+l0) * n1;
4053 const C val0 = psij_const0[offset_psij+l0];
4055 for(l1=0; l1<=2*m+1; l1++)
4057 const INT i1 = (i0 + index_temp1[l1]) * n2;
4058 const C val1 = psij_const1[l1];
4060 for(l2=0; l2<=2*m+1; l2++)
4061 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4067 INT u0 = MAX(my_u0,ar_u0);
4069 INT offset_psij = u0-ar_u0;
4071 assert(offset_psij >= 0);
4072 assert(o0-u0 <= 2*m+1);
4073 assert(offset_psij+o0-u0 <= 2*m+1);
4076 for (l0 = 0; l0 <= o0-u0; l0++)
4078 INT i0 = (u0+l0) * n1;
4079 const C val0 = psij_const0[offset_psij+l0];
4081 for(l1=0; l1<=2*m+1; l1++)
4083 const INT i1 = (i0 + index_temp1[l1]) * n2;
4084 const C val1 = psij_const1[l1];
4086 for(l2=0; l2<=2*m+1; l2++)
4087 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4092 o0 = MIN(my_o0,ar_o0);
4093 offset_psij += my_u0-ar_u0+n0;
4098 assert(o0-u0 <= 2*m+1);
4099 assert(offset_psij+o0-u0 <= 2*m+1);
4102 for (l0 = 0; l0 <= o0-u0; l0++)
4104 INT i0 = (u0+l0) * n1;
4105 const C val0 = psij_const0[offset_psij+l0];
4107 for(l1=0; l1<=2*m+1; l1++)
4109 const INT i1 = (i0 + index_temp1[l1]) * n2;
4110 const C val1 = psij_const1[l1];
4112 for(l2=0; l2<=2*m+1; l2++)
4113 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4122 static void nfft_adjoint_3d_compute_omp_atomic(
const C f, C *g,
4123 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
4124 const R *xj0,
const R *xj1,
const R *xj2,
4125 const INT n0,
const INT n1,
const INT n2,
const INT m)
4127 INT u0,o0,l0,u1,o1,l1,u2,o2,l2;
4128 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4130 INT index_temp0[2*m+2];
4131 INT index_temp1[2*m+2];
4132 INT index_temp2[2*m+2];
4134 uo2(&u0,&o0,*xj0, n0, m);
4135 uo2(&u1,&o1,*xj1, n1, m);
4136 uo2(&u2,&o2,*xj2, n2, m);
4138 for (l0=0; l0<=2*m+1; l0++)
4139 index_temp0[l0] = (u0+l0)%n0;
4141 for (l1=0; l1<=2*m+1; l1++)
4142 index_temp1[l1] = (u1+l1)%n1;
4144 for (l2=0; l2<=2*m+1; l2++)
4145 index_temp2[l2] = (u2+l2)%n2;
4147 for(l0=0; l0<=2*m+1; l0++)
4149 for(l1=0; l1<=2*m+1; l1++)
4151 for(l2=0; l2<=2*m+1; l2++)
4153 INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4155 R *lhs_real = (R*)lhs;
4156 C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4159 lhs_real[0] += CREAL(val);
4162 lhs_real[1] += CIMAG(val);
4170 static void nfft_adjoint_3d_compute_serial(
const C *fj, C *g,
4171 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
const R *xj0,
4172 const R *xj1,
const R *xj2,
const INT n0,
const INT n1,
const INT n2,
4175 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4177 const R *psij0, *psij1, *psij2;
4179 psij0 = psij_const0;
4180 psij1 = psij_const1;
4181 psij2 = psij_const2;
4183 uo2(&u0, &o0, *xj0, n0, m);
4184 uo2(&u1, &o1, *xj1, n1, m);
4185 uo2(&u2, &o2, *xj2, n2, m);
4190 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4192 psij1 = psij_const1;
4193 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4195 psij2 = psij_const2;
4196 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4197 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4198 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4203 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4205 psij1 = psij_const1;
4206 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4208 psij2 = psij_const2;
4209 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4210 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4211 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4212 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4213 for (l2 = 0; l2 <= o2; l2++)
4214 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4219 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4221 psij1 = psij_const1;
4222 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4224 psij2 = psij_const2;
4225 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4226 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4227 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4229 for (l1 = 0; l1 <= o1; l1++, psij1++)
4231 psij2 = psij_const2;
4232 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4233 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4234 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4239 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4241 psij1 = psij_const1;
4242 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4244 psij2 = psij_const2;
4245 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4246 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4247 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4248 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4249 for (l2 = 0; l2 <= o2; l2++)
4250 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4252 for (l1 = 0; l1 <= o1; l1++, psij1++)
4254 psij2 = psij_const2;
4255 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4256 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4257 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4258 gj = g + ((u0 + l0) * n1 + l1) * n2;
4259 for (l2 = 0; l2 <= o2; l2++)
4260 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4268 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4270 psij1 = psij_const1;
4271 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4273 psij2 = psij_const2;
4274 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4275 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4276 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4280 for (l0 = 0; l0 <= o0; l0++, psij0++)
4282 psij1 = psij_const1;
4283 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4285 psij2 = psij_const2;
4286 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4287 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4288 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4293 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4295 psij1 = psij_const1;
4296 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4298 psij2 = psij_const2;
4299 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4300 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4301 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4302 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4303 for (l2 = 0; l2 <= o2; l2++)
4304 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4308 for (l0 = 0; l0 <= o0; l0++, psij0++)
4310 psij1 = psij_const1;
4311 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4313 psij2 = psij_const2;
4314 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4315 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4316 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4317 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4318 for (l2 = 0; l2 <= o2; l2++)
4319 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4326 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4328 psij1 = psij_const1;
4329 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4331 psij2 = psij_const2;
4332 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4333 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4334 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4336 for (l1 = 0; l1 <= o1; l1++, psij1++)
4338 psij2 = psij_const2;
4339 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4340 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4341 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4344 for (l0 = 0; l0 <= o0; l0++, psij0++)
4346 psij1 = psij_const1;
4347 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4349 psij2 = psij_const2;
4350 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4351 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4352 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4354 for (l1 = 0; l1 <= o1; l1++, psij1++)
4356 psij2 = psij_const2;
4357 gj = g + (l0 * n1 + l1) * n2 + u2;
4358 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4359 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4364 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4366 psij1 = psij_const1;
4367 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4369 psij2 = psij_const2;
4370 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4371 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4372 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4373 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4374 for (l2 = 0; l2 <= o2; l2++)
4375 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4377 for (l1 = 0; l1 <= o1; l1++, psij1++)
4379 psij2 = psij_const2;
4380 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4381 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4382 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4383 gj = g + ((u0 + l0) * n1 + l1) * n2;
4384 for (l2 = 0; l2 <= o2; l2++)
4385 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4389 for (l0 = 0; l0 <= o0; l0++, psij0++)
4391 psij1 = psij_const1;
4392 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4394 psij2 = psij_const2;
4395 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4396 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4397 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4398 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4399 for (l2 = 0; l2 <= o2; l2++)
4400 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4402 for (l1 = 0; l1 <= o1; l1++, psij1++)
4404 psij2 = psij_const2;
4405 gj = g + (l0 * n1 + l1) * n2 + u2;
4406 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4407 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4408 gj = g + (l0 * n1 + l1) * n2;
4409 for (l2 = 0; l2 <= o2; l2++)
4410 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4417 static void nfft_trafo_3d_B(
X(plan) *ths)
4419 const INT n0 = ths->n[0];
4420 const INT n1 = ths->n[1];
4421 const INT n2 = ths->n[2];
4422 const INT M = ths->M_total;
4423 const INT m = ths->m;
4425 const C* g = (C*) ths->g;
4431 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4433 #pragma omp parallel for default(shared) private(k)
4435 for (k = 0; k < M; k++)
4438 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4440 for (l = 0; l < lprod; l++)
4441 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4449 #pragma omp parallel for default(shared) private(k)
4451 for (k = 0; k < M; k++)
4453 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4454 nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4461 R fg_exp_l[3*(2*m+2)];
4463 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4464 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4465 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4468 #pragma omp parallel for default(shared) private(k)
4470 for (k = 0; k < M; k++)
4472 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4474 R psij_const[3*(2*m+2)];
4475 R fg_psij0 = ths->psi[2*j*3];
4476 R fg_psij1 = ths->psi[2*j*3+1];
4477 R fg_psij2 = K(1.0);
4479 psij_const[0] = fg_psij0;
4480 for(l=1; l<=2*m+1; l++)
4482 fg_psij2 *= fg_psij1;
4483 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4486 fg_psij0 = ths->psi[2*(j*3+1)];
4487 fg_psij1 = ths->psi[2*(j*3+1)+1];
4489 psij_const[2*m+2] = fg_psij0;
4490 for(l=1; l<=2*m+1; l++)
4492 fg_psij2 *= fg_psij1;
4493 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4496 fg_psij0 = ths->psi[2*(j*3+2)];
4497 fg_psij1 = ths->psi[2*(j*3+2)+1];
4499 psij_const[2*(2*m+2)] = fg_psij0;
4500 for(l=1; l<=2*m+1; l++)
4502 fg_psij2 *= fg_psij1;
4503 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4506 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4514 R fg_exp_l[3*(2*m+2)];
4516 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4517 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4518 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4523 #pragma omp parallel for default(shared) private(k)
4525 for (k = 0; k < M; k++)
4527 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4529 R psij_const[3*(2*m+2)];
4530 R fg_psij0, fg_psij1, fg_psij2;
4532 uo(ths,j,&u,&o,(INT)0);
4533 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4534 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
4536 psij_const[0] = fg_psij0;
4537 for(l=1; l<=2*m+1; l++)
4539 fg_psij2 *= fg_psij1;
4540 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4543 uo(ths,j,&u,&o,(INT)1);
4544 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4545 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
4547 psij_const[2*m+2] = fg_psij0;
4548 for(l=1; l<=2*m+1; l++)
4550 fg_psij2 *= fg_psij1;
4551 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4554 uo(ths,j,&u,&o,(INT)2);
4555 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
4556 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
4558 psij_const[2*(2*m+2)] = fg_psij0;
4559 for(l=1; l<=2*m+1; l++)
4561 fg_psij2 *= fg_psij1;
4562 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4565 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4573 const INT K = ths->K, ip_s = K / (m + 2);
4578 #pragma omp parallel for default(shared) private(k)
4580 for (k = 0; k < M; k++)
4585 R psij_const[3*(2*m+2)];
4586 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4588 uo(ths,j,&u,&o,(INT)0);
4589 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
4590 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4591 ip_w = ip_y - (R)(ip_u);
4592 for(l=0; l < 2*m+2; l++)
4593 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4594 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4596 uo(ths,j,&u,&o,(INT)1);
4597 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
4598 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4599 ip_w = ip_y - (R)(ip_u);
4600 for(l=0; l < 2*m+2; l++)
4601 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4602 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4604 uo(ths,j,&u,&o,(INT)2);
4605 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
4606 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4607 ip_w = ip_y - (R)(ip_u);
4608 for(l=0; l < 2*m+2; l++)
4609 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4610 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4612 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4622 #pragma omp parallel for default(shared) private(k)
4624 for (k = 0; k < M; k++)
4626 R psij_const[3*(2*m+2)];
4628 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4630 uo(ths,j,&u,&o,(INT)0);
4631 for(l=0;l<=2*m+1;l++)
4632 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
4634 uo(ths,j,&u,&o,(INT)1);
4635 for(l=0;l<=2*m+1;l++)
4636 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
4638 uo(ths,j,&u,&o,(INT)2);
4639 for(l=0;l<=2*m+1;l++)
4640 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
4642 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4647 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4649 assert(ar_x[2*k] >= min_u_a || k == M-1); \
4651 assert(ar_x[2*k-2] < min_u_a); \
4654 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
4658 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4660 assert(ar_x[2*k] >= min_u_b || k == M-1); \
4662 assert(ar_x[2*k-2] < min_u_b); \
4665 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
4668 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4669 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4670 ths->psi+j*3*(2*m+2), \
4671 ths->psi+(j*3+1)*(2*m+2), \
4672 ths->psi+(j*3+2)*(2*m+2), \
4673 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4674 n0, n1, n2, m, my_u0, my_o0);
4676 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4679 R psij_const[3*(2*m+2)]; \
4680 R fg_psij0 = ths->psi[2*j*3]; \
4681 R fg_psij1 = ths->psi[2*j*3+1]; \
4682 R fg_psij2 = K(1.0); \
4684 psij_const[0] = fg_psij0; \
4685 for(l=1; l<=2*m+1; l++) \
4687 fg_psij2 *= fg_psij1; \
4688 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4691 fg_psij0 = ths->psi[2*(j*3+1)]; \
4692 fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4693 fg_psij2 = K(1.0); \
4694 psij_const[2*m+2] = fg_psij0; \
4695 for(l=1; l<=2*m+1; l++) \
4697 fg_psij2 *= fg_psij1; \
4698 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4701 fg_psij0 = ths->psi[2*(j*3+2)]; \
4702 fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4703 fg_psij2 = K(1.0); \
4704 psij_const[2*(2*m+2)] = fg_psij0; \
4705 for(l=1; l<=2*m+1; l++) \
4707 fg_psij2 *= fg_psij1; \
4708 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4711 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4712 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4713 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4714 n0, n1, n2, m, my_u0, my_o0); \
4717 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4720 R psij_const[3*(2*m+2)]; \
4721 R fg_psij0, fg_psij1, fg_psij2; \
4723 uo(ths,j,&u,&o,(INT)0); \
4724 fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/n0,0)); \
4725 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
4726 fg_psij2 = K(1.0); \
4727 psij_const[0] = fg_psij0; \
4728 for(l=1; l<=2*m+1; l++) \
4730 fg_psij2 *= fg_psij1; \
4731 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4734 uo(ths,j,&u,&o,(INT)1); \
4735 fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/n1,1)); \
4736 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
4737 fg_psij2 = K(1.0); \
4738 psij_const[2*m+2] = fg_psij0; \
4739 for(l=1; l<=2*m+1; l++) \
4741 fg_psij2 *= fg_psij1; \
4742 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4745 uo(ths,j,&u,&o,(INT)2); \
4746 fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/n2,2)); \
4747 fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
4748 fg_psij2 = K(1.0); \
4749 psij_const[2*(2*m+2)] = fg_psij0; \
4750 for(l=1; l<=2*m+1; l++) \
4752 fg_psij2 *= fg_psij1; \
4753 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4756 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4757 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4758 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4759 n0, n1, n2, m, my_u0, my_o0); \
4762 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
4765 R psij_const[3*(2*m+2)]; \
4769 uo(ths,j,&u,&o,(INT)0); \
4770 ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
4771 ip_u = LRINT(FLOOR(ip_y)); \
4773 for(l=0; l < 2*m+2; l++) \
4774 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4775 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
4777 uo(ths,j,&u,&o,(INT)1); \
4778 ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
4779 ip_u = LRINT(FLOOR(ip_y)); \
4781 for(l=0; l < 2*m+2; l++) \
4782 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4783 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4785 uo(ths,j,&u,&o,(INT)2); \
4786 ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
4787 ip_u = LRINT(FLOOR(ip_y)); \
4789 for(l=0; l < 2*m+2; l++) \
4790 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4791 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4793 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4794 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4795 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4796 n0, n1, n2, m, my_u0, my_o0); \
4799 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
4802 R psij_const[3*(2*m+2)]; \
4804 uo(ths,j,&u,&o,(INT)0); \
4805 for(l=0;l<=2*m+1;l++) \
4806 psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/n0,0)); \
4808 uo(ths,j,&u,&o,(INT)1); \
4809 for(l=0;l<=2*m+1;l++) \
4810 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
4812 uo(ths,j,&u,&o,(INT)2); \
4813 for(l=0;l<=2*m+1;l++) \
4814 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
4816 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4817 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4818 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4819 n0, n1, n2, m, my_u0, my_o0); \
4822 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
4824 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
4826 _Pragma("omp parallel private(k)") \
4828 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
4829 INT *ar_x = ths->index_x; \
4831 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
4832 &min_u_b, &max_u_b, 3, ths->n, m); \
4834 if (min_u_a != -1) \
4836 k = index_x_binary_search(ar_x, M, min_u_a); \
4838 MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4842 INT u_prod = ar_x[2*k]; \
4843 INT j = ar_x[2*k+1]; \
4845 if (u_prod < min_u_a || u_prod > max_u_a) \
4848 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4854 if (min_u_b != -1) \
4856 INT k = index_x_binary_search(ar_x, M, min_u_b); \
4858 MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4862 INT u_prod = ar_x[2*k]; \
4863 INT j = ar_x[2*k+1]; \
4865 if (u_prod < min_u_b || u_prod > max_u_b) \
4868 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4878 static void nfft_adjoint_3d_B(
X(plan) *ths)
4881 const INT n0 = ths->n[0];
4882 const INT n1 = ths->n[1];
4883 const INT n2 = ths->n[2];
4884 const INT M = ths->M_total;
4885 const INT m = ths->m;
4889 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
4893 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
4894 (INT)3, ths->n, m, ths->flags, ths->index_x);
4901 MACRO_adjoint_3d_B_OMP_BLOCKWISE(
PRE_PSI)
4905 #pragma omp parallel for default(shared) private(k)
4907 for (k = 0; k < M; k++)
4909 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4911 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4913 nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4921 R fg_exp_l[3*(2*m+2)];
4923 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4924 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4925 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4932 #pragma omp parallel for default(shared) private(k)
4934 for (k = 0; k < M; k++)
4936 R psij_const[3*(2*m+2)];
4937 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4939 R fg_psij0 = ths->psi[2*j*3];
4940 R fg_psij1 = ths->psi[2*j*3+1];
4941 R fg_psij2 = K(1.0);
4943 psij_const[0] = fg_psij0;
4944 for(l=1; l<=2*m+1; l++)
4946 fg_psij2 *= fg_psij1;
4947 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4950 fg_psij0 = ths->psi[2*(j*3+1)];
4951 fg_psij1 = ths->psi[2*(j*3+1)+1];
4953 psij_const[2*m+2] = fg_psij0;
4954 for(l=1; l<=2*m+1; l++)
4956 fg_psij2 *= fg_psij1;
4957 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4960 fg_psij0 = ths->psi[2*(j*3+2)];
4961 fg_psij1 = ths->psi[2*(j*3+2)+1];
4963 psij_const[2*(2*m+2)] = fg_psij0;
4964 for(l=1; l<=2*m+1; l++)
4966 fg_psij2 *= fg_psij1;
4967 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4971 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4973 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4982 R fg_exp_l[3*(2*m+2)];
4984 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4985 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4986 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4991 MACRO_adjoint_3d_B_OMP_BLOCKWISE(
FG_PSI)
4995 #pragma omp parallel for default(shared) private(k)
4997 for (k = 0; k < M; k++)
5000 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5001 R psij_const[3*(2*m+2)];
5002 R fg_psij0, fg_psij1, fg_psij2;
5004 uo(ths,j,&u,&o,(INT)0);
5005 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
5006 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
5008 psij_const[0] = fg_psij0;
5009 for(l=1; l<=2*m+1; l++)
5011 fg_psij2 *= fg_psij1;
5012 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
5015 uo(ths,j,&u,&o,(INT)1);
5016 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
5017 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
5019 psij_const[2*m+2] = fg_psij0;
5020 for(l=1; l<=2*m+1; l++)
5022 fg_psij2 *= fg_psij1;
5023 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5026 uo(ths,j,&u,&o,(INT)2);
5027 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
5028 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
5030 psij_const[2*(2*m+2)] = fg_psij0;
5031 for(l=1; l<=2*m+1; l++)
5033 fg_psij2 *= fg_psij1;
5034 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5038 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5040 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5049 const INT K = ths->K;
5050 const INT ip_s = K / (m + 2);
5059 #pragma omp parallel for default(shared) private(k)
5061 for (k = 0; k < M; k++)
5066 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5067 R psij_const[3*(2*m+2)];
5069 uo(ths,j,&u,&o,(INT)0);
5070 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
5071 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5072 ip_w = ip_y - (R)(ip_u);
5073 for(l=0; l < 2*m+2; l++)
5074 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5075 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5077 uo(ths,j,&u,&o,(INT)1);
5078 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
5079 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5080 ip_w = ip_y - (R)(ip_u);
5081 for(l=0; l < 2*m+2; l++)
5082 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5083 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5085 uo(ths,j,&u,&o,(INT)2);
5086 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
5087 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5088 ip_w = ip_y - (R)(ip_u);
5089 for(l=0; l < 2*m+2; l++)
5090 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5091 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5094 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5096 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5106 MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5110 #pragma omp parallel for default(shared) private(k)
5112 for (k = 0; k < M; k++)
5115 R psij_const[3*(2*m+2)];
5116 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5118 uo(ths,j,&u,&o,(INT)0);
5119 for(l=0;l<=2*m+1;l++)
5120 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
5122 uo(ths,j,&u,&o,(INT)1);
5123 for(l=0;l<=2*m+1;l++)
5124 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
5126 uo(ths,j,&u,&o,(INT)2);
5127 for(l=0;l<=2*m+1;l++)
5128 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
5131 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5133 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5139 void X(trafo_3d)(
X(plan) *ths)
5141 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5143 X(trafo_direct)(ths);
5147 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5149 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5150 R ck01, ck02, ck11, ck12, ck21, ck22;
5151 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5152 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5164 f_hat=(C*)ths->f_hat;
5165 g_hat=(C*)ths->g_hat;
5169 #pragma omp parallel for default(shared) private(k0)
5170 for (k0 = 0; k0 < ths->n_total; k0++)
5171 ths->g_hat[k0] = 0.0;
5173 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
5176 if(ths->flags & PRE_PHI_HUT)
5178 c_phi_inv01=ths->c_phi_inv[0];
5179 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5182 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5184 for(k0=0;k0<N0/2;k0++)
5186 ck01=c_phi_inv01[k0];
5187 ck02=c_phi_inv02[k0];
5188 c_phi_inv11=ths->c_phi_inv[1];
5189 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5191 for(k1=0;k1<N1/2;k1++)
5193 ck11=c_phi_inv11[k1];
5194 ck12=c_phi_inv12[k1];
5195 c_phi_inv21=ths->c_phi_inv[2];
5196 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5198 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5199 f_hat111=f_hat + (k0*N1+k1)*N2;
5200 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5201 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5202 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5203 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5204 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5205 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5207 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5208 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5209 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5210 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5211 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5212 f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5213 g_hat222=g_hat + (k0*n1+k1)*n2;
5214 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5216 for(k2=0;k2<N2/2;k2++)
5218 ck21=c_phi_inv21[k2];
5219 ck22=c_phi_inv22[k2];
5221 g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5222 g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5223 g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5224 g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5226 g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5227 g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5228 g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5229 g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5236 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5238 for(k0=0;k0<N0/2;k0++)
5240 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5241 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5242 for(k1=0;k1<N1/2;k1++)
5244 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5245 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5247 for(k2=0;k2<N2/2;k2++)
5249 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5250 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5252 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5253 g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5254 g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5255 g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5257 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5258 g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5259 g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5260 g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5268 FFTW(execute)(ths->my_fftw_plan1);
5272 nfft_trafo_3d_B(ths);
5276 void X(adjoint_3d)(
X(plan) *ths)
5278 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5280 X(adjoint_direct)(ths);
5284 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5286 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5287 R ck01, ck02, ck11, ck12, ck21, ck22;
5288 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5289 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5301 f_hat=(C*)ths->f_hat;
5302 g_hat=(C*)ths->g_hat;
5305 nfft_adjoint_3d_B(ths);
5309 FFTW(execute)(ths->my_fftw_plan2);
5313 if(ths->flags & PRE_PHI_HUT)
5315 c_phi_inv01=ths->c_phi_inv[0];
5316 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5319 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5321 for(k0=0;k0<N0/2;k0++)
5323 ck01=c_phi_inv01[k0];
5324 ck02=c_phi_inv02[k0];
5325 c_phi_inv11=ths->c_phi_inv[1];
5326 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5328 for(k1=0;k1<N1/2;k1++)
5330 ck11=c_phi_inv11[k1];
5331 ck12=c_phi_inv12[k1];
5332 c_phi_inv21=ths->c_phi_inv[2];
5333 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5335 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5336 f_hat111=f_hat + (k0*N1+k1)*N2;
5337 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5338 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5339 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5340 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5341 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5342 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5344 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5345 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5346 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5347 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5348 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5349 f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5350 g_hat222=g_hat + (k0*n1+k1)*n2;
5351 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5353 for(k2=0;k2<N2/2;k2++)
5355 ck21=c_phi_inv21[k2];
5356 ck22=c_phi_inv22[k2];
5358 f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5359 f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5360 f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5361 f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5363 f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5364 f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5365 f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5366 f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5373 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5375 for(k0=0;k0<N0/2;k0++)
5377 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5378 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5379 for(k1=0;k1<N1/2;k1++)
5381 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5382 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5384 for(k2=0;k2<N2/2;k2++)
5386 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5387 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5389 f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5390 f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5391 f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5392 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5394 f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5395 f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5396 f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5397 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5407 void X(trafo)(
X(plan) *ths)
5410 for (
int j = 0; j < ths->d; j++)
5412 if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5414 X(trafo_direct)(ths);
5421 case 1:
X(trafo_1d)(ths);
break;
5422 case 2:
X(trafo_2d)(ths);
break;
5423 case 3:
X(trafo_3d)(ths);
break;
5427 ths->g_hat = ths->g1;
5442 FFTW(execute)(ths->my_fftw_plan1);
5455 void X(adjoint)(
X(plan) *ths)
5458 for (
int j = 0; j < ths->d; j++)
5460 if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5462 X(adjoint_direct)(ths);
5469 case 1:
X(adjoint_1d)(ths);
break;
5470 case 2:
X(adjoint_2d)(ths);
break;
5471 case 3:
X(adjoint_3d)(ths);
break;
5490 FFTW(execute)(ths->my_fftw_plan2);
5506 static
void precompute_phi_hut(
X(plan) *ths)
5511 ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) *
sizeof(R*));
5513 for (t = 0; t < ths->d; t++)
5515 ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) *
sizeof(R));
5517 for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
5519 ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
5528 void X(precompute_lin_psi)(
X(plan) *ths)
5534 for (t=0; t<ths->d; t++)
5536 step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
5537 for(j = 0;j <= ths->K; j++)
5539 ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
5544 void X(precompute_fg_psi)(
X(plan) *ths)
5551 for (t=0; t<ths->d; t++)
5555 #pragma omp parallel for default(shared) private(j,u,o)
5557 for (j = 0; j < ths->M_total; j++)
5561 ths->psi[2*(j*ths->d+t)]=
5562 (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
5564 ths->psi[2*(j*ths->d+t)+1]=
5565 EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
5571 void X(precompute_psi)(
X(plan) *ths)
5580 for (t=0; t<ths->d; t++)
5584 #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5586 for (j = 0; j < ths->M_total; j++)
5590 for(l = u, lj = 0; l <= o; l++, lj++)
5591 ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
5592 (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
5599 static void nfft_precompute_full_psi_omp(
X(plan) *ths)
5606 for(t=0,lprod = 1; t<ths->d; t++)
5607 lprod *= 2*ths->m+2;
5610 #pragma omp parallel for default(shared) private(j)
5611 for(j=0; j<ths->M_total; j++)
5617 INT ll_plain[ths->d+1];
5619 INT u[ths->d], o[ths->d];
5621 R phi_prod[ths->d+1];
5627 MACRO_init_uo_l_lj_t;
5629 for(l_L=0; l_L<lprod; l_L++, ix++)
5631 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5633 ths->psi_index_g[ix]=ll_plain[ths->d];
5634 ths->psi[ix]=phi_prod[ths->d];
5636 MACRO_count_uo_l_lj_t;
5639 ths->psi_index_f[j]=lprod;
5644 void X(precompute_full_psi)(
X(plan) *ths)
5649 nfft_precompute_full_psi_omp(ths);
5656 INT ll_plain[ths->d+1];
5658 INT u[ths->d], o[ths->d];
5660 R phi_prod[ths->d+1];
5666 phi_prod[0] = K(1.0);
5669 for (t = 0, lprod = 1; t < ths->d; t++)
5670 lprod *= 2 * ths->m + 2;
5672 for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
5674 MACRO_init_uo_l_lj_t;
5676 for (l_L = 0; l_L < lprod; l_L++, ix++)
5678 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5680 ths->psi_index_g[ix] = ll_plain[ths->d];
5681 ths->psi[ix] = phi_prod[ths->d];
5683 MACRO_count_uo_l_lj_t;
5686 ths->psi_index_f[j] = ix - ix_old;
5692 void X(precompute_one_psi)(
X(plan) *ths)
5695 X(precompute_lin_psi)(ths);
5697 X(precompute_fg_psi)(ths);
5699 X(precompute_psi)(ths);
5701 X(precompute_full_psi)(ths);
5704 static void init_help(
X(plan) *ths)
5709 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5710 ths->flags |= NFFT_SORT_NODES;
5712 ths->N_total = intprod(ths->N, 0, ths->d);
5713 ths->n_total = intprod(ths->n, 0, ths->d);
5715 ths->sigma = (R*) Y(malloc)((size_t)(ths->d) *
sizeof(R));
5717 for(t = 0;t < ths->d; t++)
5718 ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
5723 ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) *
sizeof(R));
5726 ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) *
sizeof(C));
5729 ths->f = (C*)Y(malloc)((size_t)(ths->M_total) *
sizeof(C));
5731 if(ths->flags & PRE_PHI_HUT)
5732 precompute_phi_hut(ths);
5738 ths->K = Y(m2K)(ths->m);
5740 ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) *
sizeof(R));
5744 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) *
sizeof(R));
5747 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) *
sizeof(R));
5751 for (t = 0, lprod = 1; t < ths->d; t++)
5752 lprod *= 2 * ths->m + 2;
5754 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) *
sizeof(R));
5756 ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) *
sizeof(INT));
5757 ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) *
sizeof(INT));
5763 INT nthreads = Y(get_num_threads)();
5766 ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) *
sizeof(C));
5769 ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) *
sizeof(C));
5774 #pragma omp critical (nfft_omp_critical_fftw_plan)
5776 FFTW(plan_with_nthreads)(nthreads);
5779 int *_n = Y(malloc)((size_t)(ths->d) *
sizeof(int));
5781 for (t = 0; t < ths->d; t++)
5782 _n[t] = (
int)(ths->n[t]);
5784 ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
5785 ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
5793 if(ths->flags & NFFT_SORT_NODES)
5794 ths->index_x = (INT*) Y(malloc)(
sizeof(INT) * 2U * (
size_t)(ths->M_total));
5796 ths->index_x = NULL;
5798 ths->mv_trafo = (void (*) (
void* ))
X(trafo);
5799 ths->mv_adjoint = (void (*) (
void* ))
X(adjoint);
5802 void X(init)(
X(plan) *ths,
int d,
int *N,
int M_total)
5808 ths->N = (INT*) Y(malloc)((size_t)(d) *
sizeof(INT));
5810 for (t = 0; t < d; t++)
5811 ths->N[t] = (INT)N[t];
5813 ths->M_total = (INT)M_total;
5815 ths->n = (INT*) Y(malloc)((size_t)(d) *
sizeof(INT));
5817 for (t = 0; t < d; t++)
5818 ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
5820 ths->m = WINDOW_HELP_ESTIMATE_m;
5827 NFFT_OMP_BLOCKWISE_ADJOINT;
5837 ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
5843 void X(init_guru)(
X(plan) *ths,
int d,
int *N,
int M_total,
int *n,
int m,
5844 unsigned flags,
unsigned fftw_flags)
5849 ths->M_total = (INT)M_total;
5850 ths->N = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5852 for (t = 0; t < d; t++)
5853 ths->N[t] = (INT)N[t];
5855 ths->n = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5857 for (t = 0; t < d; t++)
5858 ths->n[t] = (INT)n[t];
5863 ths->fftw_flags = fftw_flags;
5869 void X(init_lin)(
X(plan) *ths,
int d,
int *N,
int M_total,
int *n,
int m,
int K,
5870 unsigned flags,
unsigned fftw_flags)
5875 ths->M_total = (INT)M_total;
5876 ths->N = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5878 for (t = 0; t < d; t++)
5879 ths->N[t] = (INT)N[t];
5881 ths->n = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5883 for (t = 0; t < d; t++)
5884 ths->n[t] = (INT)n[t];
5889 ths->fftw_flags = fftw_flags;
5895 void X(init_1d)(
X(plan) *ths,
int N1,
int M_total)
5901 X(init)(ths, 1, N, M_total);
5904 void X(init_2d)(
X(plan) *ths,
int N1,
int N2,
int M_total)
5910 X(init)(ths, 2, N, M_total);
5913 void X(init_3d)(
X(plan) *ths,
int N1,
int N2,
int N3,
int M_total)
5920 X(init)(ths, 3, N, M_total);
5923 const char*
X(check)(
X(plan) *ths)
5928 return "Member f not initialized.";
5931 return "Member x not initialized.";
5934 return "Member f_hat not initialized.";
5936 if ((ths->flags &
PRE_LIN_PSI) && ths->K < ths->M_total)
5937 return "Number of nodes too small to use PRE_LIN_PSI.";
5939 for (j = 0; j < ths->M_total * ths->d; j++)
5941 if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
5943 return "ths->x out of range [-0.5,0.5)";
5947 for (j = 0; j < ths->d; j++)
5949 if (ths->sigma[j] <= 1)
5950 return "Oversampling factor too small";
5957 if(ths->N[j]%2 == 1)
5958 return "polynomial degree N has to be even";
5963 void X(finalize)(
X(plan) *ths)
5967 if(ths->flags & NFFT_SORT_NODES)
5968 Y(free)(ths->index_x);
5973 #pragma omp critical (nfft_omp_critical_fftw_plan)
5975 FFTW(destroy_plan)(ths->my_fftw_plan2);
5977 #pragma omp critical (nfft_omp_critical_fftw_plan)
5979 FFTW(destroy_plan)(ths->my_fftw_plan1);
5989 Y(free)(ths->psi_index_g);
5990 Y(free)(ths->psi_index_f);
6003 if(ths->flags & PRE_PHI_HUT)
6005 for (t = 0; t < ths->d; t++)
6006 Y(free)(ths->c_phi_inv[t]);
6007 Y(free)(ths->c_phi_inv);
6014 Y(free)(ths->f_hat);
6019 WINDOW_HELP_FINALIZE;
6021 Y(free)(ths->sigma);
#define TIC(a)
Timing, method works since the inaccurate timer is updated mostly in the measured function...
#define X(name)
Include header for C99 complex datatype.
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.
Header file for the nfft3 library.