NFFT  3.4.1
nfft.c
1 /*
2  * Copyright (c) 2002, 2017 Jens Keiner, Stefan Kunis, Daniel Potts
3  *
4  * This program is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation; either version 2 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc., 51
16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18 
19 /* Nonequispaced FFT */
20 
21 /* Authors: D. Potts, S. Kunis 2002-2009, Jens Keiner 2009, Toni Volkmer 2012 */
22 
23 /* configure header */
24 #include "config.h"
25 
26 /* complex datatype (maybe) */
27 #ifdef HAVE_COMPLEX_H
28 #include<complex.h>
29 #endif
30 
31 /* NFFT headers */
32 #include "nfft3.h"
33 #include "infft.h"
34 
35 #ifdef _OPENMP
36 #include <omp.h>
37 #endif
38 
39 #ifdef OMP_ASSERT
40 #include <assert.h>
41 #endif
42 
43 #undef X
44 #define X(name) NFFT(name)
45 
47 static inline INT intprod(const INT *vec, const INT a, const INT d)
48 {
49  INT t, p;
50 
51  p = 1;
52  for (t = 0; t < d; t++)
53  p *= vec[t] - a;
54 
55  return p;
56 }
57 
58 /* handy shortcuts */
59 #define BASE(x) CEXP(x)
60 
75 static inline void sort0(const INT d, const INT *n, const INT m,
76  const INT local_x_num, const R *local_x, INT *ar_x)
77 {
78  INT u_j[d], i, j, help, rhigh;
79  INT *ar_x_temp;
80  INT nprod;
81 
82  for (i = 0; i < local_x_num; i++)
83  {
84  ar_x[2 * i] = 0;
85  ar_x[2 *i + 1] = i;
86  for (j = 0; j < d; j++)
87  {
88  help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
89  u_j[j] = (help % n[j] + n[j]) % n[j];
90 
91  ar_x[2 * i] += u_j[j];
92  if (j + 1 < d)
93  ar_x[2 * i] *= n[j + 1];
94  }
95  }
96 
97  for (j = 0, nprod = 1; j < d; j++)
98  nprod *= n[j];
99 
100  rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
101 
102  ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) * sizeof(INT));
103  Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
104 #ifdef OMP_ASSERT
105  for (i = 1; i < local_x_num; i++)
106  assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
107 #endif
108  Y(free)(ar_x_temp);
109 }
110 
119 static inline void sort(const X(plan) *ths)
120 {
121  if (ths->flags & NFFT_SORT_NODES)
122  sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
123 }
124 
145 void X(trafo_direct)(const X(plan) *ths)
146 {
147  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
148 
149  memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
150 
151  if (ths->d == 1)
152  {
153  /* specialize for univariate case, rationale: faster */
154  INT j;
155 #ifdef _OPENMP
156  #pragma omp parallel for default(shared) private(j)
157 #endif
158  for (j = 0; j < ths->M_total; j++)
159  {
160  INT k_L;
161  for (k_L = 0; k_L < ths->N_total; k_L++)
162  {
163  R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
164  f[j] += f_hat[k_L] * BASE(-II * omega);
165  }
166  }
167  }
168  else
169  {
170  /* multivariate case */
171  INT j;
172 #ifdef _OPENMP
173  #pragma omp parallel for default(shared) private(j)
174 #endif
175  for (j = 0; j < ths->M_total; j++)
176  {
177  R x[ths->d], omega, Omega[ths->d + 1];
178  INT t, t2, k_L, k[ths->d];
179  Omega[0] = K(0.0);
180  for (t = 0; t < ths->d; t++)
181  {
182  k[t] = -ths->N[t]/2;
183  x[t] = K2PI * ths->x[j * ths->d + t];
184  Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
185  }
186  omega = Omega[ths->d];
187 
188  for (k_L = 0; k_L < ths->N_total; k_L++)
189  {
190  f[j] += f_hat[k_L] * BASE(-II * omega);
191  {
192  for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
193  k[t]-= ths->N[t]-1;
194 
195  k[t]++;
196 
197  for (t2 = t; t2 < ths->d; t2++)
198  Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
199 
200  omega = Omega[ths->d];
201  }
202  }
203  }
204  }
205 }
206 
207 void X(adjoint_direct)(const X(plan) *ths)
208 {
209  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
210 
211  memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
212 
213  if (ths->d == 1)
214  {
215  /* specialize for univariate case, rationale: faster */
216 #ifdef _OPENMP
217  INT k_L;
218  #pragma omp parallel for default(shared) private(k_L)
219  for (k_L = 0; k_L < ths->N_total; k_L++)
220  {
221  INT j;
222  for (j = 0; j < ths->M_total; j++)
223  {
224  R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
225  f_hat[k_L] += f[j] * BASE(II * omega);
226  }
227  }
228 #else
229  INT j;
230  for (j = 0; j < ths->M_total; j++)
231  {
232  INT k_L;
233  for (k_L = 0; k_L < ths->N_total; k_L++)
234  {
235  R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
236  f_hat[k_L] += f[j] * BASE(II * omega);
237  }
238  }
239 #endif
240  }
241  else
242  {
243  /* multivariate case */
244  INT j, k_L;
245 #ifdef _OPENMP
246  #pragma omp parallel for default(shared) private(j, k_L)
247  for (k_L = 0; k_L < ths->N_total; k_L++)
248  {
249  INT k[ths->d], k_temp, t;
250 
251  k_temp = k_L;
252 
253  for (t = ths->d - 1; t >= 0; t--)
254  {
255  k[t] = k_temp % ths->N[t] - ths->N[t]/2;
256  k_temp /= ths->N[t];
257  }
258 
259  for (j = 0; j < ths->M_total; j++)
260  {
261  R omega = K(0.0);
262  for (t = 0; t < ths->d; t++)
263  omega += k[t] * K2PI * ths->x[j * ths->d + t];
264  f_hat[k_L] += f[j] * BASE(II * omega);
265  }
266  }
267 #else
268  for (j = 0; j < ths->M_total; j++)
269  {
270  R x[ths->d], omega, Omega[ths->d+1];
271  INT t, t2, k[ths->d];
272  Omega[0] = K(0.0);
273  for (t = 0; t < ths->d; t++)
274  {
275  k[t] = -ths->N[t]/2;
276  x[t] = K2PI * ths->x[j * ths->d + t];
277  Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
278  }
279  omega = Omega[ths->d];
280  for (k_L = 0; k_L < ths->N_total; k_L++)
281  {
282  f_hat[k_L] += f[j] * BASE(II * omega);
283 
284  for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
285  k[t]-= ths->N[t]-1;
286 
287  k[t]++;
288 
289  for (t2 = t; t2 < ths->d; t2++)
290  Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
291 
292  omega = Omega[ths->d];
293  }
294  }
295 #endif
296  }
297 }
298 
324 static inline void uo(const X(plan) *ths, const INT j, INT *up, INT *op,
325  const INT act_dim)
326 {
327  const R xj = ths->x[j * ths->d + act_dim];
328  INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
329 
330  (*up) = c - (ths->m);
331  (*op) = c + 1 + (ths->m);
332 }
333 
334 static inline void uo2(INT *u, INT *o, const R x, const INT n, const INT m)
335 {
336  INT c = LRINT(FLOOR(x * (R)(n)));
337 
338  *u = (c - m + n) % n;
339  *o = (c + 1 + m + n) % n;
340 }
341 
342 #define MACRO_D_compute_A \
343 { \
344  g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
345 }
346 
347 #define MACRO_D_compute_T \
348 { \
349  f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
350 }
351 
352 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
353 
354 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
355 
356 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
357 
358 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2));
359 
360 #define MACRO_init_k_ks \
361 { \
362  for (t = ths->d-1; 0 <= t; t--) \
363  { \
364  kp[t] = k[t] = 0; \
365  ks[t] = ths->N[t]/2; \
366  } \
367  t++; \
368 }
369 
370 #define MACRO_update_c_phi_inv_k(which_one) \
371 { \
372  for (t2 = t; t2 < ths->d; t2++) \
373  { \
374  c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
375  ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
376  k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
377  } \
378 }
379 
380 #define MACRO_count_k_ks \
381 { \
382  for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
383  { \
384  kp[t] = k[t] = 0; \
385  ks[t]= ths->N[t]/2; \
386  } \
387 \
388  kp[t]++; k[t]++; ks[t]++; \
389  if(kp[t] == ths->N[t]/2) \
390  { \
391  k[t] = ths->n[t] - ths->N[t]/2; \
392  ks[t] = 0; \
393  } \
394 } \
395 
396 /* sub routines for the fast transforms matrix vector multiplication with D, D^T */
397 #define MACRO_D(which_one) \
398 static inline void D_serial_ ## which_one (X(plan) *ths) \
399 { \
400  C *f_hat, *g_hat; /* local copy */ \
401  R c_phi_inv_k[ths->d+1]; /* postfix product of PHI_HUT */ \
402  INT t, t2; /* index dimensions */ \
403  INT k_L; /* plain index */ \
404  INT kp[ths->d]; /* multi index (simple) */ \
405  INT k[ths->d]; /* multi index in g_hat */ \
406  INT ks[ths->d]; /* multi index in f_hat, c_phi_inv*/ \
407  INT k_plain[ths->d+1]; /* postfix plain index */ \
408  INT ks_plain[ths->d+1]; /* postfix plain index */ \
409  \
410  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
411  MACRO_D_init_result_ ## which_one; \
412 \
413  c_phi_inv_k[0] = K(1.0); \
414  k_plain[0] = 0; \
415  ks_plain[0] = 0; \
416 \
417  MACRO_init_k_ks; \
418 \
419  if (ths->flags & PRE_PHI_HUT) \
420  { \
421  for (k_L = 0; k_L < ths->N_total; k_L++) \
422  { \
423  MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
424  MACRO_D_compute_ ## which_one; \
425  MACRO_count_k_ks; \
426  } \
427  } \
428  else \
429  { \
430  for (k_L = 0; k_L < ths->N_total; k_L++) \
431  { \
432  MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
433  MACRO_D_compute_ ## which_one; \
434  MACRO_count_k_ks; \
435  } \
436  } \
437 }
438 
439 #ifdef _OPENMP
440 static inline void D_openmp_A(X(plan) *ths)
441 {
442  C *f_hat, *g_hat;
443  INT k_L;
445  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
446  memset(g_hat, 0, ths->n_total * sizeof(C));
447 
448  if (ths->flags & PRE_PHI_HUT)
449  {
450  #pragma omp parallel for default(shared) private(k_L)
451  for (k_L = 0; k_L < ths->N_total; k_L++)
452  {
453  INT kp[ths->d]; //0..N-1
454  INT k[ths->d];
455  INT ks[ths->d];
456  R c_phi_inv_k_val = K(1.0);
457  INT k_plain_val = 0;
458  INT ks_plain_val = 0;
459  INT t;
460  INT k_temp = k_L;
461 
462  for (t = ths->d-1; t >= 0; t--)
463  {
464  kp[t] = k_temp % ths->N[t];
465  if (kp[t] >= ths->N[t]/2)
466  k[t] = ths->n[t] - ths->N[t] + kp[t];
467  else
468  k[t] = kp[t];
469  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
470  k_temp /= ths->N[t];
471  }
472 
473  for (t = 0; t < ths->d; t++)
474  {
475  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
476  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
477  k_plain_val = k_plain_val*ths->n[t] + k[t];
478  }
479 
480  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
481  } /* for(k_L) */
482  } /* if(PRE_PHI_HUT) */
483  else
484  {
485  #pragma omp parallel for default(shared) private(k_L)
486  for (k_L = 0; k_L < ths->N_total; k_L++)
487  {
488  INT kp[ths->d]; //0..N-1
489  INT k[ths->d];
490  INT ks[ths->d];
491  R c_phi_inv_k_val = K(1.0);
492  INT k_plain_val = 0;
493  INT ks_plain_val = 0;
494  INT t;
495  INT k_temp = k_L;
496 
497  for (t = ths->d-1; t >= 0; t--)
498  {
499  kp[t] = k_temp % ths->N[t];
500  if (kp[t] >= ths->N[t]/2)
501  k[t] = ths->n[t] - ths->N[t] + kp[t];
502  else
503  k[t] = kp[t];
504  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
505  k_temp /= ths->N[t];
506  }
507 
508  for (t = 0; t < ths->d; t++)
509  {
510  c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
511  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
512  k_plain_val = k_plain_val*ths->n[t] + k[t];
513  }
514 
515  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
516  } /* for(k_L) */
517  } /* else(PRE_PHI_HUT) */
518 }
519 #endif
520 
521 #ifndef _OPENMP
522 MACRO_D(A)
523 #endif
524 
525 static inline void D_A(X(plan) *ths)
526 {
527 #ifdef _OPENMP
528  D_openmp_A(ths);
529 #else
530  D_serial_A(ths);
531 #endif
532 }
533 
534 #ifdef _OPENMP
535 static void D_openmp_T(X(plan) *ths)
536 {
537  C *f_hat, *g_hat;
538  INT k_L;
540  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
541  memset(f_hat, 0, ths->N_total * sizeof(C));
542 
543  if (ths->flags & PRE_PHI_HUT)
544  {
545  #pragma omp parallel for default(shared) private(k_L)
546  for (k_L = 0; k_L < ths->N_total; k_L++)
547  {
548  INT kp[ths->d]; //0..N-1
549  INT k[ths->d];
550  INT ks[ths->d];
551  R c_phi_inv_k_val = K(1.0);
552  INT k_plain_val = 0;
553  INT ks_plain_val = 0;
554  INT t;
555  INT k_temp = k_L;
556 
557  for (t = ths->d - 1; t >= 0; t--)
558  {
559  kp[t] = k_temp % ths->N[t];
560  if (kp[t] >= ths->N[t]/2)
561  k[t] = ths->n[t] - ths->N[t] + kp[t];
562  else
563  k[t] = kp[t];
564  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
565  k_temp /= ths->N[t];
566  }
567 
568  for (t = 0; t < ths->d; t++)
569  {
570  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
571  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
572  k_plain_val = k_plain_val*ths->n[t] + k[t];
573  }
574 
575  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
576  } /* for(k_L) */
577  } /* if(PRE_PHI_HUT) */
578  else
579  {
580  #pragma omp parallel for default(shared) private(k_L)
581  for (k_L = 0; k_L < ths->N_total; k_L++)
582  {
583  INT kp[ths->d]; //0..N-1
584  INT k[ths->d];
585  INT ks[ths->d];
586  R c_phi_inv_k_val = K(1.0);
587  INT k_plain_val = 0;
588  INT ks_plain_val = 0;
589  INT t;
590  INT k_temp = k_L;
591 
592  for (t = ths->d-1; t >= 0; t--)
593  {
594  kp[t] = k_temp % ths->N[t];
595  if (kp[t] >= ths->N[t]/2)
596  k[t] = ths->n[t] - ths->N[t] + kp[t];
597  else
598  k[t] = kp[t];
599  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
600  k_temp /= ths->N[t];
601  }
602 
603  for (t = 0; t < ths->d; t++)
604  {
605  c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
606  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
607  k_plain_val = k_plain_val*ths->n[t] + k[t];
608  }
609 
610  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
611  } /* for(k_L) */
612  } /* else(PRE_PHI_HUT) */
613 }
614 #endif
615 
616 #ifndef _OPENMP
617 MACRO_D(T)
618 #endif
619 
620 static void D_T(X(plan) *ths)
621 {
622 #ifdef _OPENMP
623  D_openmp_T(ths);
624 #else
625  D_serial_T(ths);
626 #endif
627 }
628 
629 /* sub routines for the fast transforms matrix vector multiplication with B, B^T */
630 #define MACRO_B_init_result_A memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
631 #define MACRO_B_init_result_T memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
632 
633 #define MACRO_B_PRE_FULL_PSI_compute_A \
634 { \
635  (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
636 }
637 
638 #define MACRO_B_PRE_FULL_PSI_compute_T \
639 { \
640  g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
641 }
642 
643 #define MACRO_B_compute_A \
644 { \
645  (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
646 }
647 
648 #define MACRO_B_compute_T \
649 { \
650  g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
651 }
652 
653 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
654 
655 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
656 
657 #define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \
658  - ((R)l[t2])/((R)ths->n[t2]), t2)
659 
660 #define MACRO_init_uo_l_lj_t \
661 { \
662  for (t = ths->d-1; t >= 0; t--) \
663  { \
664  uo(ths,j,&u[t],&o[t],t); \
665  l[t] = u[t]; \
666  lj[t] = 0; \
667  } \
668  t++; \
669 }
670 
671 #define MACRO_update_phi_prod_ll_plain(which_one) { \
672  for (t2 = t; t2 < ths->d; t2++) \
673  { \
674  phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \
675  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + (l[t2] + ths->n[t2]) % ths->n[t2]; \
676  } \
677 }
678 
679 #define MACRO_count_uo_l_lj_t \
680 { \
681  for (t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
682  { \
683  l[t] = u[t]; \
684  lj[t] = 0; \
685  } \
686  \
687  l[t]++; \
688  lj[t]++; \
689 }
690 
691 #define MACRO_B(which_one) \
692 static inline void B_serial_ ## which_one (X(plan) *ths) \
693 { \
694  INT lprod; /* 'regular bandwidth' of matrix B */ \
695  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
696  INT t, t2; /* index dimensions */ \
697  INT j; /* index nodes */ \
698  INT l_L, ix; /* index one row of B */ \
699  INT l[ths->d]; /* multi index u<=l<=o */ \
700  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
701  INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
702  R phi_prod[ths->d+1]; /* postfix product of PHI */ \
703  C *f, *g; /* local copy */ \
704  C *fj; /* local copy */ \
705  R y[ths->d]; \
706  R fg_psi[ths->d][2*ths->m+2]; \
707  R fg_exp_l[ths->d][2*ths->m+2]; \
708  INT l_fg,lj_fg; \
709  R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
710  R ip_w; \
711  INT ip_u; \
712  INT ip_s = ths->K/(ths->m+2); \
713  \
714  f = (C*)ths->f; g = (C*)ths->g; \
715  \
716  MACRO_B_init_result_ ## which_one; \
717  \
718  if (ths->flags & PRE_FULL_PSI) \
719  { \
720  for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
721  { \
722  for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
723  { \
724  MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
725  } \
726  } \
727  return; \
728  } \
729 \
730  phi_prod[0] = K(1.0); \
731  ll_plain[0] = 0; \
732 \
733  for (t = 0, lprod = 1; t < ths->d; t++) \
734  lprod *= (2 * ths->m + 2); \
735 \
736  if (ths->flags & PRE_PSI) \
737  { \
738  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
739  { \
740  MACRO_init_uo_l_lj_t; \
741  \
742  for (l_L = 0; l_L < lprod; l_L++) \
743  { \
744  MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
745  \
746  MACRO_B_compute_ ## which_one; \
747  \
748  MACRO_count_uo_l_lj_t; \
749  } /* for(l_L) */ \
750  } /* for(j) */ \
751  return; \
752  } /* if(PRE_PSI) */ \
753  \
754  if (ths->flags & PRE_FG_PSI) \
755  { \
756  for(t2 = 0; t2 < ths->d; t2++) \
757  { \
758  tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \
759  tmpEXP2sq = tmpEXP2*tmpEXP2; \
760  tmp2 = K(1.0); \
761  tmp3 = K(1.0); \
762  fg_exp_l[t2][0] = K(1.0); \
763  for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
764  { \
765  tmp3 = tmp2*tmpEXP2; \
766  tmp2 *= tmpEXP2sq; \
767  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \
768  } \
769  } \
770  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
771  { \
772  MACRO_init_uo_l_lj_t; \
773  \
774  for (t2 = 0; t2 < ths->d; t2++) \
775  { \
776  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
777  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
778  tmp1 = K(1.0); \
779  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
780  { \
781  tmp1 *= tmpEXP1; \
782  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
783  } \
784  } \
785  \
786  for (l_L= 0; l_L < lprod; l_L++) \
787  { \
788  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
789  \
790  MACRO_B_compute_ ## which_one; \
791  \
792  MACRO_count_uo_l_lj_t; \
793  } /* for(l_L) */ \
794  } /* for(j) */ \
795  return; \
796  } /* if(PRE_FG_PSI) */ \
797  \
798  if (ths->flags & FG_PSI) \
799  { \
800  for (t2 = 0; t2 < ths->d; t2++) \
801  { \
802  tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
803  tmpEXP2sq = tmpEXP2*tmpEXP2; \
804  tmp2 = K(1.0); \
805  tmp3 = K(1.0); \
806  fg_exp_l[t2][0] = K(1.0); \
807  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
808  { \
809  tmp3 = tmp2*tmpEXP2; \
810  tmp2 *= tmpEXP2sq; \
811  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
812  } \
813  } \
814  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
815  { \
816  MACRO_init_uo_l_lj_t; \
817  \
818  for (t2 = 0; t2 < ths->d; t2++) \
819  { \
820  fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\
821  \
822  tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
823  /ths->b[t2]); \
824  tmp1 = K(1.0); \
825  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
826  { \
827  tmp1 *= tmpEXP1; \
828  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
829  } \
830  } \
831  \
832  for (l_L = 0; l_L < lprod; l_L++) \
833  { \
834  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
835  \
836  MACRO_B_compute_ ## which_one; \
837  \
838  MACRO_count_uo_l_lj_t; \
839  } /* for(l_L) */ \
840  } /* for(j) */ \
841  return; \
842  } /* if(FG_PSI) */ \
843  \
844  if (ths->flags & PRE_LIN_PSI) \
845  { \
846  for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
847  { \
848  MACRO_init_uo_l_lj_t; \
849  \
850  for (t2 = 0; t2 < ths->d; t2++) \
851  { \
852  y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
853  * ((R)(ths->K))) / (R)(ths->m + 2); \
854  ip_u = LRINT(FLOOR(y[t2])); \
855  ip_w = y[t2]-ip_u; \
856  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
857  { \
858  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
859  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
860  * (ip_w); \
861  } \
862  } \
863  \
864  for (l_L = 0; l_L < lprod; l_L++) \
865  { \
866  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
867  \
868  MACRO_B_compute_ ## which_one; \
869  \
870  MACRO_count_uo_l_lj_t; \
871  } /* for(l_L) */ \
872  } /* for(j) */ \
873  return; \
874  } /* if(PRE_LIN_PSI) */ \
875  \
876  /* no precomputed psi at all */ \
877  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
878  { \
879  MACRO_init_uo_l_lj_t; \
880  \
881  for (l_L = 0; l_L < lprod; l_L++) \
882  { \
883  MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
884  \
885  MACRO_B_compute_ ## which_one; \
886  \
887  MACRO_count_uo_l_lj_t; \
888  } /* for(l_L) */ \
889  } /* for(j) */ \
890 } /* nfft_B */ \
891 
892 #ifndef _OPENMP
893 MACRO_B(A)
894 #endif
895 
896 #ifdef _OPENMP
897 static inline void B_openmp_A (X(plan) *ths)
898 {
899  INT lprod; /* 'regular bandwidth' of matrix B */
900  INT k;
901 
902  memset(ths->f, 0, ths->M_total * sizeof(C));
903 
904  for (k = 0, lprod = 1; k < ths->d; k++)
905  lprod *= (2*ths->m+2);
906 
907  if (ths->flags & PRE_FULL_PSI)
908  {
909  #pragma omp parallel for default(shared) private(k)
910  for (k = 0; k < ths->M_total; k++)
911  {
912  INT l;
913  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
914  ths->f[j] = K(0.0);
915  for (l = 0; l < lprod; l++)
916  ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
917  }
918  return;
919  }
920 
921  if (ths->flags & PRE_PSI)
922  {
923  #pragma omp parallel for default(shared) private(k)
924  for (k = 0; k < ths->M_total; k++)
925  {
926  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
927  INT t, t2; /* index dimensions */
928  INT l_L; /* index one row of B */
929  INT l[ths->d]; /* multi index u<=l<=o */
930  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
931  INT ll_plain[ths->d+1]; /* postfix plain index in g */
932  R phi_prod[ths->d+1]; /* postfix product of PHI */
933  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
934 
935  phi_prod[0] = K(1.0);
936  ll_plain[0] = 0;
937 
938  MACRO_init_uo_l_lj_t;
939 
940  for (l_L = 0; l_L < lprod; l_L++)
941  {
942  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
943 
944  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
945 
946  MACRO_count_uo_l_lj_t;
947  } /* for(l_L) */
948  } /* for(j) */
949  return;
950  } /* if(PRE_PSI) */
951 
952  if (ths->flags & PRE_FG_PSI)
953  {
954  INT t, t2; /* index dimensions */
955  R fg_exp_l[ths->d][2*ths->m+2];
956 
957  for (t2 = 0; t2 < ths->d; t2++)
958  {
959  INT lj_fg;
960  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
961  R tmpEXP2sq = tmpEXP2*tmpEXP2;
962  R tmp2 = K(1.0);
963  R tmp3 = K(1.0);
964  fg_exp_l[t2][0] = K(1.0);
965  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
966  {
967  tmp3 = tmp2*tmpEXP2;
968  tmp2 *= tmpEXP2sq;
969  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
970  }
971  }
972 
973  #pragma omp parallel for default(shared) private(k,t,t2)
974  for (k = 0; k < ths->M_total; k++)
975  {
976  INT ll_plain[ths->d+1]; /* postfix plain index in g */
977  R phi_prod[ths->d+1]; /* postfix product of PHI */
978  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
979  INT l[ths->d]; /* multi index u<=l<=o */
980  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
981  R fg_psi[ths->d][2*ths->m+2];
982  R tmpEXP1, tmp1;
983  INT l_fg,lj_fg;
984  INT l_L;
985  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
986 
987  phi_prod[0] = K(1.0);
988  ll_plain[0] = 0;
989 
990  MACRO_init_uo_l_lj_t;
991 
992  for (t2 = 0; t2 < ths->d; t2++)
993  {
994  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
995  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
996  tmp1 = K(1.0);
997  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
998  {
999  tmp1 *= tmpEXP1;
1000  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1001  }
1002  }
1003 
1004  for (l_L= 0; l_L < lprod; l_L++)
1005  {
1006  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1007 
1008  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1009 
1010  MACRO_count_uo_l_lj_t;
1011  } /* for(l_L) */
1012  } /* for(j) */
1013  return;
1014  } /* if(PRE_FG_PSI) */
1015 
1016  if (ths->flags & FG_PSI)
1017  {
1018  INT t, t2; /* index dimensions */
1019  R fg_exp_l[ths->d][2*ths->m+2];
1020 
1021  sort(ths);
1022 
1023  for (t2 = 0; t2 < ths->d; t2++)
1024  {
1025  INT lj_fg;
1026  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1027  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1028  R tmp2 = K(1.0);
1029  R tmp3 = K(1.0);
1030  fg_exp_l[t2][0] = K(1.0);
1031  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1032  {
1033  tmp3 = tmp2*tmpEXP2;
1034  tmp2 *= tmpEXP2sq;
1035  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1036  }
1037  }
1038 
1039  #pragma omp parallel for default(shared) private(k,t,t2)
1040  for (k = 0; k < ths->M_total; k++)
1041  {
1042  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1043  R phi_prod[ths->d+1]; /* postfix product of PHI */
1044  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1045  INT l[ths->d]; /* multi index u<=l<=o */
1046  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1047  R fg_psi[ths->d][2*ths->m+2];
1048  R tmpEXP1, tmp1;
1049  INT l_fg,lj_fg;
1050  INT l_L;
1051  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1052 
1053  phi_prod[0] = K(1.0);
1054  ll_plain[0] = 0;
1055 
1056  MACRO_init_uo_l_lj_t;
1057 
1058  for (t2 = 0; t2 < ths->d; t2++)
1059  {
1060  fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1061 
1062  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1063  /ths->b[t2]);
1064  tmp1 = K(1.0);
1065  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1066  {
1067  tmp1 *= tmpEXP1;
1068  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1069  }
1070  }
1071 
1072  for (l_L = 0; l_L < lprod; l_L++)
1073  {
1074  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1075 
1076  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1077 
1078  MACRO_count_uo_l_lj_t;
1079  } /* for(l_L) */
1080  } /* for(j) */
1081  return;
1082  } /* if(FG_PSI) */
1083 
1084  if (ths->flags & PRE_LIN_PSI)
1085  {
1086  sort(ths);
1087 
1088  #pragma omp parallel for default(shared) private(k)
1089  for (k = 0; k<ths->M_total; k++)
1090  {
1091  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1092  INT t, t2; /* index dimensions */
1093  INT l_L; /* index one row of B */
1094  INT l[ths->d]; /* multi index u<=l<=o */
1095  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1096  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1097  R phi_prod[ths->d+1]; /* postfix product of PHI */
1098  R y[ths->d];
1099  R fg_psi[ths->d][2*ths->m+2];
1100  INT l_fg,lj_fg;
1101  R ip_w;
1102  INT ip_u;
1103  INT ip_s = ths->K/(ths->m+2);
1104  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1105 
1106  phi_prod[0] = K(1.0);
1107  ll_plain[0] = 0;
1108 
1109  MACRO_init_uo_l_lj_t;
1110 
1111  for (t2 = 0; t2 < ths->d; t2++)
1112  {
1113  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1114  * ((R)ths->K))/(ths->m+2);
1115  ip_u = LRINT(FLOOR(y[t2]));
1116  ip_w = y[t2]-ip_u;
1117  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1118  {
1119  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1120  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1121  * (ip_w);
1122  }
1123  }
1124 
1125  for (l_L = 0; l_L < lprod; l_L++)
1126  {
1127  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1128 
1129  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1130 
1131  MACRO_count_uo_l_lj_t;
1132  } /* for(l_L) */
1133  } /* for(j) */
1134  return;
1135  } /* if(PRE_LIN_PSI) */
1136 
1137  /* no precomputed psi at all */
1138  sort(ths);
1139 
1140  #pragma omp parallel for default(shared) private(k)
1141  for (k = 0; k < ths->M_total; k++)
1142  {
1143  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1144  INT t, t2; /* index dimensions */
1145  INT l_L; /* index one row of B */
1146  INT l[ths->d]; /* multi index u<=l<=o */
1147  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1148  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1149  R phi_prod[ths->d+1]; /* postfix product of PHI */
1150  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1151 
1152  phi_prod[0] = K(1.0);
1153  ll_plain[0] = 0;
1154 
1155  MACRO_init_uo_l_lj_t;
1156 
1157  for (l_L = 0; l_L < lprod; l_L++)
1158  {
1159  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1160 
1161  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1162 
1163  MACRO_count_uo_l_lj_t;
1164  } /* for(l_L) */
1165  } /* for(j) */
1166 }
1167 #endif
1168 
1169 static void B_A(X(plan) *ths)
1170 {
1171 #ifdef _OPENMP
1172  B_openmp_A(ths);
1173 #else
1174  B_serial_A(ths);
1175 #endif
1176 }
1177 
1178 #ifdef _OPENMP
1179 
1194 static inline INT index_x_binary_search(const INT *ar_x, const INT len, const INT key)
1195 {
1196  INT left = 0, right = len - 1;
1197 
1198  if (len == 1)
1199  return 0;
1200 
1201  while (left < right - 1)
1202  {
1203  INT i = (left + right) / 2;
1204  if (ar_x[2*i] >= key)
1205  right = i;
1206  else if (ar_x[2*i] < key)
1207  left = i;
1208  }
1209 
1210  if (ar_x[2*left] < key && left != len-1)
1211  return left+1;
1212 
1213  return left;
1214 }
1215 #endif
1216 
1217 #ifdef _OPENMP
1218 
1233 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
1234  INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b, const INT d,
1235  const INT *n, const INT m)
1236 {
1237  const INT n0 = n[0];
1238  INT k;
1239  INT nthreads = omp_get_num_threads();
1240  INT nthreads_used = MIN(nthreads, n0);
1241  INT size_per_thread = n0 / nthreads_used;
1242  INT size_left = n0 - size_per_thread * nthreads_used;
1243  INT size_g[nthreads_used];
1244  INT offset_g[nthreads_used];
1245  INT my_id = omp_get_thread_num();
1246  INT n_prod_rest = 1;
1247 
1248  for (k = 1; k < d; k++)
1249  n_prod_rest *= n[k];
1250 
1251  *min_u_a = -1;
1252  *max_u_a = -1;
1253  *min_u_b = -1;
1254  *max_u_b = -1;
1255  *my_u0 = -1;
1256  *my_o0 = -1;
1257 
1258  if (my_id < nthreads_used)
1259  {
1260  const INT m22 = 2 * m + 2;
1261 
1262  offset_g[0] = 0;
1263  for (k = 0; k < nthreads_used; k++)
1264  {
1265  if (k > 0)
1266  offset_g[k] = offset_g[k-1] + size_g[k-1];
1267  size_g[k] = size_per_thread;
1268  if (size_left > 0)
1269  {
1270  size_g[k]++;
1271  size_left--;
1272  }
1273  }
1274 
1275  *my_u0 = offset_g[my_id];
1276  *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1277 
1278  if (nthreads_used > 1)
1279  {
1280  *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1281  *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1282  }
1283  else
1284  {
1285  *min_u_a = 0;
1286  *max_u_a = n_prod_rest * n0 - 1;
1287  }
1288 
1289  if (*min_u_a < 0)
1290  {
1291  *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1292  *max_u_b = n_prod_rest * n0 - 1;
1293  *min_u_a = 0;
1294  }
1295 
1296  if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1297  {
1298  *max_u_a = *max_u_b;
1299  *min_u_b = -1;
1300  *max_u_b = -1;
1301  }
1302 #ifdef OMP_ASSERT
1303  assert(*min_u_a <= *max_u_a);
1304  assert(*min_u_b <= *max_u_b);
1305  assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1306 #endif
1307  }
1308 }
1309 #endif
1310 
1319 static void nfft_adjoint_B_compute_full_psi(C *g, const INT *psi_index_g,
1320  const R *psi, const C *f, const INT M, const INT d, const INT *n,
1321  const INT m, const unsigned flags, const INT *index_x)
1322 {
1323  INT k;
1324  INT lprod;
1325 #ifdef _OPENMP
1326  INT lprod_m1;
1327 #endif
1328 #ifndef _OPENMP
1329  UNUSED(n);
1330 #endif
1331  {
1332  INT t;
1333  for(t = 0, lprod = 1; t < d; t++)
1334  lprod *= 2 * m + 2;
1335  }
1336 #ifdef _OPENMP
1337  lprod_m1 = lprod / (2 * m + 2);
1338 #endif
1339 
1340 #ifdef _OPENMP
1341  if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1342  {
1343  #pragma omp parallel private(k)
1344  {
1345  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1346  const INT *ar_x = index_x;
1347  INT n_prod_rest = 1;
1348 
1349  for (k = 1; k < d; k++)
1350  n_prod_rest *= n[k];
1351 
1352  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1353 
1354  if (min_u_a != -1)
1355  {
1356  k = index_x_binary_search(ar_x, M, min_u_a);
1357 #ifdef OMP_ASSERT
1358  assert(ar_x[2*k] >= min_u_a || k == M-1);
1359  if (k > 0)
1360  assert(ar_x[2*k-2] < min_u_a);
1361 #endif
1362  while (k < M)
1363  {
1364  INT l0, lrest;
1365  INT u_prod = ar_x[2*k];
1366  INT j = ar_x[2*k+1];
1367 
1368  if (u_prod < min_u_a || u_prod > max_u_a)
1369  break;
1370 
1371  for (l0 = 0; l0 < 2 * m + 2; l0++)
1372  {
1373  const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1374 
1375  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1376  continue;
1377 
1378  for (lrest = 0; lrest < lprod_m1; lrest++)
1379  {
1380  const INT l = l0 * lprod_m1 + lrest;
1381  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1382  }
1383  }
1384 
1385  k++;
1386  }
1387  }
1388 
1389  if (min_u_b != -1)
1390  {
1391  k = index_x_binary_search(ar_x, M, min_u_b);
1392 #ifdef OMP_ASSERT
1393  assert(ar_x[2*k] >= min_u_b || k == M-1);
1394  if (k > 0)
1395  assert(ar_x[2*k-2] < min_u_b);
1396 #endif
1397  while (k < M)
1398  {
1399  INT l0, lrest;
1400  INT u_prod = ar_x[2*k];
1401  INT j = ar_x[2*k+1];
1402 
1403  if (u_prod < min_u_b || u_prod > max_u_b)
1404  break;
1405 
1406  for (l0 = 0; l0 < 2 * m + 2; l0++)
1407  {
1408  const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1409 
1410  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1411  continue;
1412  for (lrest = 0; lrest < lprod_m1; lrest++)
1413  {
1414  const INT l = l0 * lprod_m1 + lrest;
1415  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1416  }
1417  }
1418 
1419  k++;
1420  }
1421  }
1422  } /* omp parallel */
1423  return;
1424  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */
1425 #endif
1426 
1427 #ifdef _OPENMP
1428  #pragma omp parallel for default(shared) private(k)
1429 #endif
1430  for (k = 0; k < M; k++)
1431  {
1432  INT l;
1433  INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1434 
1435  for (l = 0; l < lprod; l++)
1436  {
1437 #ifdef _OPENMP
1438  C val = psi[j * lprod + l] * f[j];
1439  C *gref = g + psi_index_g[j * lprod + l];
1440  R *gref_real = (R*) gref;
1441 
1442  #pragma omp atomic
1443  gref_real[0] += CREAL(val);
1444 
1445  #pragma omp atomic
1446  gref_real[1] += CIMAG(val);
1447 #else
1448  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1449 #endif
1450  }
1451  }
1452 }
1453 
1454 #ifndef _OPENMP
1455 MACRO_B(T)
1456 #endif
1457 
1458 #ifdef _OPENMP
1459 static inline void B_openmp_T(X(plan) *ths)
1460 {
1461  INT lprod; /* 'regular bandwidth' of matrix B */
1462  INT k;
1463 
1464  memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C));
1465 
1466  for (k = 0, lprod = 1; k < ths->d; k++)
1467  lprod *= (2*ths->m+2);
1468 
1469  if (ths->flags & PRE_FULL_PSI)
1470  {
1471  nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
1472  ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
1473  return;
1474  }
1475 
1476  if (ths->flags & PRE_PSI)
1477  {
1478  #pragma omp parallel for default(shared) private(k)
1479  for (k = 0; k < ths->M_total; k++)
1480  {
1481  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1482  INT t, t2; /* index dimensions */
1483  INT l_L; /* index one row of B */
1484  INT l[ths->d]; /* multi index u<=l<=o */
1485  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1486  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1487  R phi_prod[ths->d+1]; /* postfix product of PHI */
1488  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1489 
1490  phi_prod[0] = K(1.0);
1491  ll_plain[0] = 0;
1492 
1493  MACRO_init_uo_l_lj_t;
1494 
1495  for (l_L = 0; l_L < lprod; l_L++)
1496  {
1497  C *lhs;
1498  R *lhs_real;
1499  C val;
1500 
1501  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1502 
1503  lhs = ths->g + ll_plain[ths->d];
1504  lhs_real = (R*)lhs;
1505  val = phi_prod[ths->d] * ths->f[j];
1506 
1507  #pragma omp atomic
1508  lhs_real[0] += CREAL(val);
1509 
1510  #pragma omp atomic
1511  lhs_real[1] += CIMAG(val);
1512 
1513  MACRO_count_uo_l_lj_t;
1514  } /* for(l_L) */
1515  } /* for(j) */
1516  return;
1517  } /* if(PRE_PSI) */
1518 
1519  if (ths->flags & PRE_FG_PSI)
1520  {
1521  INT t, t2; /* index dimensions */
1522  R fg_exp_l[ths->d][2*ths->m+2];
1523  for(t2 = 0; t2 < ths->d; t2++)
1524  {
1525  INT lj_fg;
1526  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1527  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1528  R tmp2 = K(1.0);
1529  R tmp3 = K(1.0);
1530  fg_exp_l[t2][0] = K(1.0);
1531  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1532  {
1533  tmp3 = tmp2*tmpEXP2;
1534  tmp2 *= tmpEXP2sq;
1535  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1536  }
1537  }
1538 
1539  #pragma omp parallel for default(shared) private(k,t,t2)
1540  for (k = 0; k < ths->M_total; k++)
1541  {
1542  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1543  R phi_prod[ths->d+1]; /* postfix product of PHI */
1544  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1545  INT l[ths->d]; /* multi index u<=l<=o */
1546  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1547  R fg_psi[ths->d][2*ths->m+2];
1548  R tmpEXP1, tmp1;
1549  INT l_fg,lj_fg;
1550  INT l_L;
1551  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1552 
1553  phi_prod[0] = K(1.0);
1554  ll_plain[0] = 0;
1555 
1556  MACRO_init_uo_l_lj_t;
1557 
1558  for (t2 = 0; t2 < ths->d; t2++)
1559  {
1560  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
1561  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
1562  tmp1 = K(1.0);
1563  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1564  {
1565  tmp1 *= tmpEXP1;
1566  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1567  }
1568  }
1569 
1570  for (l_L= 0; l_L < lprod; l_L++)
1571  {
1572  C *lhs;
1573  R *lhs_real;
1574  C val;
1575 
1576  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1577 
1578  lhs = ths->g + ll_plain[ths->d];
1579  lhs_real = (R*)lhs;
1580  val = phi_prod[ths->d] * ths->f[j];
1581 
1582  #pragma omp atomic
1583  lhs_real[0] += CREAL(val);
1584 
1585  #pragma omp atomic
1586  lhs_real[1] += CIMAG(val);
1587 
1588  MACRO_count_uo_l_lj_t;
1589  } /* for(l_L) */
1590  } /* for(j) */
1591  return;
1592  } /* if(PRE_FG_PSI) */
1593 
1594  if (ths->flags & FG_PSI)
1595  {
1596  INT t, t2; /* index dimensions */
1597  R fg_exp_l[ths->d][2*ths->m+2];
1598 
1599  sort(ths);
1600 
1601  for (t2 = 0; t2 < ths->d; t2++)
1602  {
1603  INT lj_fg;
1604  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1605  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1606  R tmp2 = K(1.0);
1607  R tmp3 = K(1.0);
1608  fg_exp_l[t2][0] = K(1.0);
1609  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1610  {
1611  tmp3 = tmp2*tmpEXP2;
1612  tmp2 *= tmpEXP2sq;
1613  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1614  }
1615  }
1616 
1617  #pragma omp parallel for default(shared) private(k,t,t2)
1618  for (k = 0; k < ths->M_total; k++)
1619  {
1620  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1621  R phi_prod[ths->d+1]; /* postfix product of PHI */
1622  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1623  INT l[ths->d]; /* multi index u<=l<=o */
1624  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1625  R fg_psi[ths->d][2*ths->m+2];
1626  R tmpEXP1, tmp1;
1627  INT l_fg,lj_fg;
1628  INT l_L;
1629  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1630 
1631  phi_prod[0] = K(1.0);
1632  ll_plain[0] = 0;
1633 
1634  MACRO_init_uo_l_lj_t;
1635 
1636  for (t2 = 0; t2 < ths->d; t2++)
1637  {
1638  fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1639 
1640  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1641  /ths->b[t2]);
1642  tmp1 = K(1.0);
1643  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1644  {
1645  tmp1 *= tmpEXP1;
1646  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1647  }
1648  }
1649 
1650  for (l_L = 0; l_L < lprod; l_L++)
1651  {
1652  C *lhs;
1653  R *lhs_real;
1654  C val;
1655 
1656  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1657 
1658  lhs = ths->g + ll_plain[ths->d];
1659  lhs_real = (R*)lhs;
1660  val = phi_prod[ths->d] * ths->f[j];
1661 
1662  #pragma omp atomic
1663  lhs_real[0] += CREAL(val);
1664 
1665  #pragma omp atomic
1666  lhs_real[1] += CIMAG(val);
1667 
1668  MACRO_count_uo_l_lj_t;
1669  } /* for(l_L) */
1670  } /* for(j) */
1671  return;
1672  } /* if(FG_PSI) */
1673 
1674  if (ths->flags & PRE_LIN_PSI)
1675  {
1676  sort(ths);
1677 
1678  #pragma omp parallel for default(shared) private(k)
1679  for (k = 0; k<ths->M_total; k++)
1680  {
1681  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1682  INT t, t2; /* index dimensions */
1683  INT l_L; /* index one row of B */
1684  INT l[ths->d]; /* multi index u<=l<=o */
1685  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1686  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1687  R phi_prod[ths->d+1]; /* postfix product of PHI */
1688  R y[ths->d];
1689  R fg_psi[ths->d][2*ths->m+2];
1690  INT l_fg,lj_fg;
1691  R ip_w;
1692  INT ip_u;
1693  INT ip_s = ths->K/(ths->m+2);
1694  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1695 
1696  phi_prod[0] = K(1.0);
1697  ll_plain[0] = 0;
1698 
1699  MACRO_init_uo_l_lj_t;
1700 
1701  for (t2 = 0; t2 < ths->d; t2++)
1702  {
1703  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1704  * ((R)ths->K))/(ths->m+2);
1705  ip_u = LRINT(FLOOR(y[t2]));
1706  ip_w = y[t2]-ip_u;
1707  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1708  {
1709  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1710  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1711  * (ip_w);
1712  }
1713  }
1714 
1715  for (l_L = 0; l_L < lprod; l_L++)
1716  {
1717  C *lhs;
1718  R *lhs_real;
1719  C val;
1720 
1721  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1722 
1723  lhs = ths->g + ll_plain[ths->d];
1724  lhs_real = (R*)lhs;
1725  val = phi_prod[ths->d] * ths->f[j];
1726 
1727  #pragma omp atomic
1728  lhs_real[0] += CREAL(val);
1729 
1730  #pragma omp atomic
1731  lhs_real[1] += CIMAG(val);
1732 
1733  MACRO_count_uo_l_lj_t;
1734  } /* for(l_L) */
1735  } /* for(j) */
1736  return;
1737  } /* if(PRE_LIN_PSI) */
1738 
1739  /* no precomputed psi at all */
1740  sort(ths);
1741 
1742  #pragma omp parallel for default(shared) private(k)
1743  for (k = 0; k < ths->M_total; k++)
1744  {
1745  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1746  INT t, t2; /* index dimensions */
1747  INT l_L; /* index one row of B */
1748  INT l[ths->d]; /* multi index u<=l<=o */
1749  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1750  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1751  R phi_prod[ths->d+1]; /* postfix product of PHI */
1752  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1753 
1754  phi_prod[0] = K(1.0);
1755  ll_plain[0] = 0;
1756 
1757  MACRO_init_uo_l_lj_t;
1758 
1759  for (l_L = 0; l_L < lprod; l_L++)
1760  {
1761  C *lhs;
1762  R *lhs_real;
1763  C val;
1764 
1765  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1766 
1767  lhs = ths->g + ll_plain[ths->d];
1768  lhs_real = (R*)lhs;
1769  val = phi_prod[ths->d] * ths->f[j];
1770 
1771  #pragma omp atomic
1772  lhs_real[0] += CREAL(val);
1773 
1774  #pragma omp atomic
1775  lhs_real[1] += CIMAG(val);
1776 
1777  MACRO_count_uo_l_lj_t;
1778  } /* for(l_L) */
1779  } /* for(j) */
1780 }
1781 #endif
1782 
1783 static void B_T(X(plan) *ths)
1784 {
1785 #ifdef _OPENMP
1786  B_openmp_T(ths);
1787 #else
1788  B_serial_T(ths);
1789 #endif
1790 }
1791 
1792 /* ## specialized version for d=1 ########################################### */
1793 
1794 static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
1795 {
1796  const INT tmp2 = 2*m+2;
1797  INT l;
1798  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
1799 
1800  fg_exp_b0 = EXP(K(-1.0)/b);
1801  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
1802  fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
1803 
1804  for (l = 1; l < tmp2; l++)
1805  {
1806  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
1807  fg_exp_b1 *= fg_exp_b0_sq;
1808  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
1809  }
1810 }
1811 
1812 
1813 static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const,
1814  const R *xj, const INT n, const INT m)
1815 {
1816  INT u, o, l;
1817  const C *gj;
1818  const R *psij;
1819  psij = psij_const;
1820 
1821  uo2(&u, &o, *xj, n, m);
1822 
1823  if (u < o)
1824  {
1825  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
1826  (*fj) += (*psij++) * (*gj++);
1827  }
1828  else
1829  {
1830  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
1831  (*fj) += (*psij++) * (*gj++);
1832  for (l = 0, gj = g; l <= o; l++)
1833  (*fj) += (*psij++) * (*gj++);
1834  }
1835 }
1836 
1837 #ifndef _OPENMP
1838 static void nfft_adjoint_1d_compute_serial(const C *fj, C *g,
1839  const R *psij_const, const R *xj, const INT n, const INT m)
1840 {
1841  INT u,o,l;
1842  C *gj;
1843  const R *psij;
1844  psij = psij_const;
1845 
1846  uo2(&u,&o,*xj, n, m);
1847 
1848  if (u < o)
1849  {
1850  for (l = 0, gj = g+u; l <= 2*m+1; l++)
1851  (*gj++) += (*psij++) * (*fj);
1852  }
1853  else
1854  {
1855  for (l = 0, gj = g+u; l < 2*m+1-o; l++)
1856  (*gj++) += (*psij++) * (*fj);
1857  for (l = 0, gj = g; l <= o; l++)
1858  (*gj++) += (*psij++) * (*fj);
1859  }
1860 }
1861 #endif
1862 
1863 #ifdef _OPENMP
1864 /* adjoint NFFT one-dimensional case with OpenMP atomic operations */
1865 static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g,
1866  const R *psij_const, const R *xj, const INT n, const INT m)
1867 {
1868  INT u,o,l;
1869  C *gj;
1870  INT index_temp[2*m+2];
1871 
1872  uo2(&u,&o,*xj, n, m);
1873 
1874  for (l=0; l<=2*m+1; l++)
1875  index_temp[l] = (l+u)%n;
1876 
1877  for (l = 0, gj = g+u; l <= 2*m+1; l++)
1878  {
1879  INT i = index_temp[l];
1880  C *lhs = g+i;
1881  R *lhs_real = (R*)lhs;
1882  C val = psij_const[l] * f;
1883  #pragma omp atomic
1884  lhs_real[0] += CREAL(val);
1885 
1886  #pragma omp atomic
1887  lhs_real[1] += CIMAG(val);
1888  }
1889 }
1890 #endif
1891 
1892 #ifdef _OPENMP
1893 
1908 static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g,
1909  const R *psij_const, const R *xj, const INT n, const INT m,
1910  const INT my_u0, const INT my_o0)
1911 {
1912  INT ar_u,ar_o,l;
1913 
1914  uo2(&ar_u,&ar_o,*xj, n, m);
1915 
1916  if (ar_u < ar_o)
1917  {
1918  INT u = MAX(my_u0,ar_u);
1919  INT o = MIN(my_o0,ar_o);
1920  INT offset_psij = u-ar_u;
1921 #ifdef OMP_ASSERT
1922  assert(offset_psij >= 0);
1923  assert(o-u <= 2*m+1);
1924  assert(offset_psij+o-u <= 2*m+1);
1925 #endif
1926 
1927  for (l = 0; l <= o-u; l++)
1928  g[u+l] += psij_const[offset_psij+l] * f;
1929  }
1930  else
1931  {
1932  INT u = MAX(my_u0,ar_u);
1933  INT o = my_o0;
1934  INT offset_psij = u-ar_u;
1935 #ifdef OMP_ASSERT
1936  assert(offset_psij >= 0);
1937  assert(o-u <= 2*m+1);
1938  assert(offset_psij+o-u <= 2*m+1);
1939 #endif
1940 
1941  for (l = 0; l <= o-u; l++)
1942  g[u+l] += psij_const[offset_psij+l] * f;
1943 
1944  u = my_u0;
1945  o = MIN(my_o0,ar_o);
1946  offset_psij += my_u0-ar_u+n;
1947 
1948 #ifdef OMP_ASSERT
1949  if (u <= o)
1950  {
1951  assert(o-u <= 2*m+1);
1952  if (offset_psij+o-u > 2*m+1)
1953  {
1954  fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
1955  }
1956  assert(offset_psij+o-u <= 2*m+1);
1957  }
1958 #endif
1959  for (l = 0; l <= o-u; l++)
1960  g[u+l] += psij_const[offset_psij+l] * f;
1961  }
1962 }
1963 #endif
1964 
1965 static void nfft_trafo_1d_B(X(plan) *ths)
1966 {
1967  const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
1968  const C *g = (C*)ths->g;
1969 
1970  if (ths->flags & PRE_FULL_PSI)
1971  {
1972  INT k;
1973 #ifdef _OPENMP
1974  #pragma omp parallel for default(shared) private(k)
1975 #endif
1976  for (k = 0; k < M; k++)
1977  {
1978  INT l;
1979  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1980  ths->f[j] = K(0.0);
1981  for (l = 0; l < m2p2; l++)
1982  ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
1983  }
1984  return;
1985  } /* if(PRE_FULL_PSI) */
1986 
1987  if (ths->flags & PRE_PSI)
1988  {
1989  INT k;
1990 #ifdef _OPENMP
1991  #pragma omp parallel for default(shared) private(k)
1992 #endif
1993  for (k = 0; k < M; k++)
1994  {
1995  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1996  nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
1997  &ths->x[j], n, m);
1998  }
1999  return;
2000  } /* if(PRE_PSI) */
2001 
2002  if (ths->flags & PRE_FG_PSI)
2003  {
2004  INT k;
2005  R fg_exp_l[m2p2];
2006 
2007  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2008 
2009 #ifdef _OPENMP
2010  #pragma omp parallel for default(shared) private(k)
2011 #endif
2012  for (k = 0; k < M; k++)
2013  {
2014  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2015  const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2016  R fg_psij2 = K(1.0);
2017  R psij_const[m2p2];
2018  INT l;
2019 
2020  psij_const[0] = fg_psij0;
2021 
2022  for (l = 1; l < m2p2; l++)
2023  {
2024  fg_psij2 *= fg_psij1;
2025  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2026  }
2027 
2028  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2029  }
2030 
2031  return;
2032  } /* if(PRE_FG_PSI) */
2033 
2034  if (ths->flags & FG_PSI)
2035  {
2036  INT k;
2037  R fg_exp_l[m2p2];
2038 
2039  sort(ths);
2040 
2041  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2042 
2043 #ifdef _OPENMP
2044  #pragma omp parallel for default(shared) private(k)
2045 #endif
2046  for (k = 0; k < M; k++)
2047  {
2048  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2049  INT u, o, l;
2050  R fg_psij0, fg_psij1, fg_psij2;
2051  R psij_const[m2p2];
2052 
2053  uo(ths, (INT)j, &u, &o, (INT)0);
2054  fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
2055  fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
2056  fg_psij2 = K(1.0);
2057 
2058  psij_const[0] = fg_psij0;
2059 
2060  for (l = 1; l < m2p2; l++)
2061  {
2062  fg_psij2 *= fg_psij1;
2063  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2064  }
2065 
2066  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2067  }
2068  return;
2069  } /* if(FG_PSI) */
2070 
2071  if (ths->flags & PRE_LIN_PSI)
2072  {
2073  const INT K = ths->K, ip_s = K / (m + 2);
2074  INT k;
2075 
2076  sort(ths);
2077 
2078 #ifdef _OPENMP
2079  #pragma omp parallel for default(shared) private(k)
2080 #endif
2081  for (k = 0; k < M; k++)
2082  {
2083  INT u, o, l;
2084  R ip_y, ip_w;
2085  INT ip_u;
2086  R psij_const[m2p2];
2087  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2088 
2089  uo(ths, (INT)j, &u, &o, (INT)0);
2090 
2091  ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2092  ip_u = (INT)(LRINT(FLOOR(ip_y)));
2093  ip_w = ip_y - (R)(ip_u);
2094 
2095  for (l = 0; l < m2p2; l++)
2096  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2097  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2098 
2099  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2100  }
2101  return;
2102  } /* if(PRE_LIN_PSI) */
2103  else
2104  {
2105  /* no precomputed psi at all */
2106  INT k;
2107 
2108  sort(ths);
2109 
2110 #ifdef _OPENMP
2111  #pragma omp parallel for default(shared) private(k)
2112 #endif
2113  for (k = 0; k < M; k++)
2114  {
2115  R psij_const[m2p2];
2116  INT u, o, l;
2117  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2118 
2119  uo(ths, (INT)j, &u, &o, (INT)0);
2120 
2121  for (l = 0; l < m2p2; l++)
2122  psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
2123 
2124  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2125  }
2126  }
2127 }
2128 
2129 
2130 #ifdef OMP_ASSERT
2131 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2132 { \
2133  assert(ar_x[2*k] >= min_u_a || k == M-1); \
2134  if (k > 0) \
2135  assert(ar_x[2*k-2] < min_u_a); \
2136 }
2137 #else
2138 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
2139 #endif
2140 
2141 #ifdef OMP_ASSERT
2142 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2143 { \
2144  assert(ar_x[2*k] >= min_u_b || k == M-1); \
2145  if (k > 0) \
2146  assert(ar_x[2*k-2] < min_u_b); \
2147 }
2148 #else
2149 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
2150 #endif
2151 
2152 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2153 { \
2154  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2155  ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2156 }
2157 
2158 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2159 { \
2160  R psij_const[2 * m + 2]; \
2161  INT u, o, l; \
2162  R fg_psij0 = ths->psi[2 * j]; \
2163  R fg_psij1 = ths->psi[2 * j + 1]; \
2164  R fg_psij2 = K(1.0); \
2165  \
2166  psij_const[0] = fg_psij0; \
2167  for (l = 1; l <= 2 * m + 1; l++) \
2168  { \
2169  fg_psij2 *= fg_psij1; \
2170  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2171  } \
2172  \
2173  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2174  ths->x + j, n, m, my_u0, my_o0); \
2175 }
2176 
2177 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
2178 { \
2179  R psij_const[2 * m + 2]; \
2180  R fg_psij0, fg_psij1, fg_psij2; \
2181  INT u, o, l; \
2182  \
2183  uo(ths, j, &u, &o, (INT)0); \
2184  fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/n,0)); \
2185  fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
2186  fg_psij2 = K(1.0); \
2187  psij_const[0] = fg_psij0; \
2188  for (l = 1; l <= 2 * m + 1; l++) \
2189  { \
2190  fg_psij2 *= fg_psij1; \
2191  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2192  } \
2193  \
2194  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2195  ths->x + j, n, m, my_u0, my_o0); \
2196 }
2197 
2198 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
2199 { \
2200  R psij_const[2 * m + 2]; \
2201  INT ip_u; \
2202  R ip_y, ip_w; \
2203  INT u, o, l; \
2204  \
2205  uo(ths, j, &u, &o, (INT)0); \
2206  \
2207  ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
2208  ip_u = LRINT(FLOOR(ip_y)); \
2209  ip_w = ip_y - ip_u; \
2210  for (l = 0; l < 2 * m + 2; l++) \
2211  psij_const[l] \
2212  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
2213  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
2214  \
2215  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2216  ths->x + j, n, m, my_u0, my_o0); \
2217 }
2218 
2219 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
2220 { \
2221  R psij_const[2 * m + 2]; \
2222  INT u, o, l; \
2223  \
2224  uo(ths, j, &u, &o, (INT)0); \
2225  \
2226  for (l = 0; l <= 2 * m + 1; l++) \
2227  psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/n,0)); \
2228  \
2229  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2230  ths->x + j, n, m, my_u0, my_o0); \
2231 }
2232 
2233 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
2234 { \
2235  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
2236  { \
2237  _Pragma("omp parallel private(k)") \
2238  { \
2239  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
2240  INT *ar_x = ths->index_x; \
2241  \
2242  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
2243  &min_u_b, &max_u_b, 1, &n, m); \
2244  \
2245  if (min_u_a != -1) \
2246  { \
2247  k = index_x_binary_search(ar_x, M, min_u_a); \
2248  \
2249  MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2250  \
2251  while (k < M) \
2252  { \
2253  INT u_prod = ar_x[2*k]; \
2254  INT j = ar_x[2*k+1]; \
2255  \
2256  if (u_prod < min_u_a || u_prod > max_u_a) \
2257  break; \
2258  \
2259  MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2260  \
2261  k++; \
2262  } \
2263  } \
2264  \
2265  if (min_u_b != -1) \
2266  { \
2267  k = index_x_binary_search(ar_x, M, min_u_b); \
2268  \
2269  MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2270  \
2271  while (k < M) \
2272  { \
2273  INT u_prod = ar_x[2*k]; \
2274  INT j = ar_x[2*k+1]; \
2275  \
2276  if (u_prod < min_u_b || u_prod > max_u_b) \
2277  break; \
2278  \
2279  MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2280  \
2281  k++; \
2282  } \
2283  } \
2284  } /* omp parallel */ \
2285  return; \
2286  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
2287 }
2288 
2289 static void nfft_adjoint_1d_B(X(plan) *ths)
2290 {
2291  const INT n = ths->n[0], M = ths->M_total, m = ths->m;
2292  INT k;
2293  C *g = (C*)ths->g;
2294 
2295  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
2296 
2297  if (ths->flags & PRE_FULL_PSI)
2298  {
2299  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2300  (INT)1, ths->n, m, ths->flags, ths->index_x);
2301  return;
2302  } /* if(PRE_FULL_PSI) */
2303 
2304  if (ths->flags & PRE_PSI)
2305  {
2306 #ifdef _OPENMP
2307  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
2308 #endif
2309 
2310 #ifdef _OPENMP
2311  #pragma omp parallel for default(shared) private(k)
2312 #endif
2313  for (k = 0; k < M; k++)
2314  {
2315  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2316 #ifdef _OPENMP
2317  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2318 #else
2319  nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2320 #endif
2321  }
2322 
2323  return;
2324  } /* if(PRE_PSI) */
2325 
2326  if (ths->flags & PRE_FG_PSI)
2327  {
2328  R fg_exp_l[2 * m + 2];
2329 
2330  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2331 
2332 #ifdef _OPENMP
2333  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
2334 #endif
2335 
2336 
2337 #ifdef _OPENMP
2338  #pragma omp parallel for default(shared) private(k)
2339 #endif
2340  for (k = 0; k < M; k++)
2341  {
2342  R psij_const[2 * m + 2];
2343  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2344  INT l;
2345  R fg_psij0 = ths->psi[2 * j];
2346  R fg_psij1 = ths->psi[2 * j + 1];
2347  R fg_psij2 = K(1.0);
2348 
2349  psij_const[0] = fg_psij0;
2350  for (l = 1; l <= 2 * m + 1; l++)
2351  {
2352  fg_psij2 *= fg_psij1;
2353  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2354  }
2355 
2356 #ifdef _OPENMP
2357  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2358 #else
2359  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2360 #endif
2361  }
2362 
2363  return;
2364  } /* if(PRE_FG_PSI) */
2365 
2366  if (ths->flags & FG_PSI)
2367  {
2368  R fg_exp_l[2 * m + 2];
2369 
2370  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2371 
2372  sort(ths);
2373 
2374 #ifdef _OPENMP
2375  MACRO_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
2376 #endif
2377 
2378 #ifdef _OPENMP
2379  #pragma omp parallel for default(shared) private(k)
2380 #endif
2381  for (k = 0; k < M; k++)
2382  {
2383  INT u,o,l;
2384  R psij_const[2 * m + 2];
2385  R fg_psij0, fg_psij1, fg_psij2;
2386  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2387 
2388  uo(ths, j, &u, &o, (INT)0);
2389  fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
2390  fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
2391  fg_psij2 = K(1.0);
2392  psij_const[0] = fg_psij0;
2393  for (l = 1; l <= 2 * m + 1; l++)
2394  {
2395  fg_psij2 *= fg_psij1;
2396  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2397  }
2398 
2399 #ifdef _OPENMP
2400  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2401 #else
2402  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2403 #endif
2404  }
2405 
2406  return;
2407  } /* if(FG_PSI) */
2408 
2409  if (ths->flags & PRE_LIN_PSI)
2410  {
2411  const INT K = ths->K;
2412  const INT ip_s = K / (m + 2);
2413 
2414  sort(ths);
2415 
2416 #ifdef _OPENMP
2417  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
2418 #endif
2419 
2420 #ifdef _OPENMP
2421  #pragma omp parallel for default(shared) private(k)
2422 #endif
2423  for (k = 0; k < M; k++)
2424  {
2425  INT u,o,l;
2426  INT ip_u;
2427  R ip_y, ip_w;
2428  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2429  R psij_const[2 * m + 2];
2430 
2431  uo(ths, j, &u, &o, (INT)0);
2432 
2433  ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2434  ip_u = (INT)(LRINT(FLOOR(ip_y)));
2435  ip_w = ip_y - (R)(ip_u);
2436  for (l = 0; l < 2 * m + 2; l++)
2437  psij_const[l]
2438  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2439  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2440 
2441 #ifdef _OPENMP
2442  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2443 #else
2444  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2445 #endif
2446  }
2447  return;
2448  } /* if(PRE_LIN_PSI) */
2449 
2450  /* no precomputed psi at all */
2451  sort(ths);
2452 
2453 #ifdef _OPENMP
2454  MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2455 #endif
2456 
2457 #ifdef _OPENMP
2458  #pragma omp parallel for default(shared) private(k)
2459 #endif
2460  for (k = 0; k < M; k++)
2461  {
2462  INT u,o,l;
2463  R psij_const[2 * m + 2];
2464  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2465 
2466  uo(ths, j, &u, &o, (INT)0);
2467 
2468  for (l = 0; l <= 2 * m + 1; l++)
2469  psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
2470 
2471 #ifdef _OPENMP
2472  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2473 #else
2474  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2475 #endif
2476  }
2477 }
2478 
2479 void X(trafo_1d)(X(plan) *ths)
2480 {
2481  if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2482  {
2483  X(trafo_direct)(ths);
2484  return;
2485  }
2486 
2487  const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
2488  C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2489 
2490  ths->g_hat = ths->g1;
2491  ths->g = ths->g2;
2492 
2493  {
2494  C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2495  R *c_phi_inv1, *c_phi_inv2;
2496 
2497  TIC(0)
2498 #ifdef _OPENMP
2499  {
2500  INT k;
2501  #pragma omp parallel for default(shared) private(k)
2502  for (k = 0; k < ths->n_total; k++)
2503  ths->g_hat[k] = 0.0;
2504  }
2505 #else
2506  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
2507 #endif
2508  if(ths->flags & PRE_PHI_HUT)
2509  {
2510  INT k;
2511  c_phi_inv1 = ths->c_phi_inv[0];
2512  c_phi_inv2 = &ths->c_phi_inv[0][N2];
2513 
2514 #ifdef _OPENMP
2515  #pragma omp parallel for default(shared) private(k)
2516 #endif
2517  for (k = 0; k < N2; k++)
2518  {
2519  g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2520  g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2521  }
2522  }
2523  else
2524  {
2525  INT k;
2526 #ifdef _OPENMP
2527  #pragma omp parallel for default(shared) private(k)
2528 #endif
2529  for (k = 0; k < N2; k++)
2530  {
2531  g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
2532  g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2533  }
2534  }
2535  TOC(0)
2536 
2537  TIC_FFTW(1)
2538  FFTW(execute)(ths->my_fftw_plan1);
2539  TOC_FFTW(1);
2540 
2541  TIC(2);
2542  nfft_trafo_1d_B(ths);
2543  TOC(2);
2544  }
2545 }
2546 
2547 void X(adjoint_1d)(X(plan) *ths)
2548 {
2549  if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2550  {
2551  X(adjoint_direct)(ths);
2552  return;
2553  }
2554 
2555  INT n,N;
2556  C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2557  R *c_phi_inv1, *c_phi_inv2;
2558 
2559  N=ths->N[0];
2560  n=ths->n[0];
2561 
2562  ths->g_hat=ths->g1;
2563  ths->g=ths->g2;
2564 
2565  f_hat1=(C*)ths->f_hat;
2566  f_hat2=(C*)&ths->f_hat[N/2];
2567  g_hat1=(C*)&ths->g_hat[n-N/2];
2568  g_hat2=(C*)ths->g_hat;
2569 
2570  TIC(2)
2571  nfft_adjoint_1d_B(ths);
2572  TOC(2)
2573 
2574  TIC_FFTW(1)
2575  FFTW(execute)(ths->my_fftw_plan2);
2576  TOC_FFTW(1);
2577 
2578  TIC(0)
2579  if(ths->flags & PRE_PHI_HUT)
2580  {
2581  INT k;
2582  c_phi_inv1=ths->c_phi_inv[0];
2583  c_phi_inv2=&ths->c_phi_inv[0][N/2];
2584 
2585 #ifdef _OPENMP
2586  #pragma omp parallel for default(shared) private(k)
2587 #endif
2588  for (k = 0; k < N/2; k++)
2589  {
2590  f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2591  f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2592  }
2593  }
2594  else
2595  {
2596  INT k;
2597 
2598 #ifdef _OPENMP
2599  #pragma omp parallel for default(shared) private(k)
2600 #endif
2601  for (k = 0; k < N/2; k++)
2602  {
2603  f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
2604  f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2605  }
2606  }
2607  TOC(0)
2608 }
2609 
2610 
2611 /* ################################################ SPECIFIC VERSIONS FOR d=2 */
2612 
2613 static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
2614 {
2615  INT l;
2616  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2617 
2618  fg_exp_b0 = EXP(K(-1.0)/b);
2619  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2620  fg_exp_b1 = K(1.0);
2621  fg_exp_b2 = K(1.0);
2622  fg_exp_l[0] = K(1.0);
2623  for(l=1; l <= 2*m+1; l++)
2624  {
2625  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2626  fg_exp_b1 *= fg_exp_b0_sq;
2627  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2628  }
2629 }
2630 
2631 static void nfft_trafo_2d_compute(C *fj, const C *g, const R *psij_const0,
2632  const R *psij_const1, const R *xj0, const R *xj1, const INT n0,
2633  const INT n1, const INT m)
2634 {
2635  INT u0,o0,l0,u1,o1,l1;
2636  const C *gj;
2637  const R *psij0,*psij1;
2638 
2639  psij0=psij_const0;
2640  psij1=psij_const1;
2641 
2642  uo2(&u0,&o0,*xj0, n0, m);
2643  uo2(&u1,&o1,*xj1, n1, m);
2644 
2645  *fj=0;
2646 
2647  if (u0 < o0)
2648  if(u1 < o1)
2649  for(l0=0; l0<=2*m+1; l0++,psij0++)
2650  {
2651  psij1=psij_const1;
2652  gj=g+(u0+l0)*n1+u1;
2653  for(l1=0; l1<=2*m+1; l1++)
2654  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2655  }
2656  else
2657  for(l0=0; l0<=2*m+1; l0++,psij0++)
2658  {
2659  psij1=psij_const1;
2660  gj=g+(u0+l0)*n1+u1;
2661  for(l1=0; l1<2*m+1-o1; l1++)
2662  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2663  gj=g+(u0+l0)*n1;
2664  for(l1=0; l1<=o1; l1++)
2665  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2666  }
2667  else
2668  if(u1<o1)
2669  {
2670  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2671  {
2672  psij1=psij_const1;
2673  gj=g+(u0+l0)*n1+u1;
2674  for(l1=0; l1<=2*m+1; l1++)
2675  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2676  }
2677  for(l0=0; l0<=o0; l0++,psij0++)
2678  {
2679  psij1=psij_const1;
2680  gj=g+l0*n1+u1;
2681  for(l1=0; l1<=2*m+1; l1++)
2682  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2683  }
2684  }
2685  else
2686  {
2687  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2688  {
2689  psij1=psij_const1;
2690  gj=g+(u0+l0)*n1+u1;
2691  for(l1=0; l1<2*m+1-o1; l1++)
2692  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2693  gj=g+(u0+l0)*n1;
2694  for(l1=0; l1<=o1; l1++)
2695  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2696  }
2697  for(l0=0; l0<=o0; l0++,psij0++)
2698  {
2699  psij1=psij_const1;
2700  gj=g+l0*n1+u1;
2701  for(l1=0; l1<2*m+1-o1; l1++)
2702  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2703  gj=g+l0*n1;
2704  for(l1=0; l1<=o1; l1++)
2705  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2706  }
2707  }
2708 }
2709 
2710 #ifdef _OPENMP
2711 /* adjoint NFFT two-dimensional case with OpenMP atomic operations */
2712 static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g,
2713  const R *psij_const0, const R *psij_const1, const R *xj0,
2714  const R *xj1, const INT n0, const INT n1, const INT m)
2715 {
2716  INT u0,o0,l0,u1,o1,l1;
2717  const INT lprod = (2*m+2) * (2*m+2);
2718 
2719  INT index_temp0[2*m+2];
2720  INT index_temp1[2*m+2];
2721 
2722  uo2(&u0,&o0,*xj0, n0, m);
2723  uo2(&u1,&o1,*xj1, n1, m);
2724 
2725  for (l0=0; l0<=2*m+1; l0++)
2726  index_temp0[l0] = (u0+l0)%n0;
2727 
2728  for (l1=0; l1<=2*m+1; l1++)
2729  index_temp1[l1] = (u1+l1)%n1;
2730 
2731  for(l0=0; l0<=2*m+1; l0++)
2732  {
2733  for(l1=0; l1<=2*m+1; l1++)
2734  {
2735  INT i = index_temp0[l0] * n1 + index_temp1[l1];
2736  C *lhs = g+i;
2737  R *lhs_real = (R*)lhs;
2738  C val = psij_const0[l0] * psij_const1[l1] * f;
2739 
2740  #pragma omp atomic
2741  lhs_real[0] += CREAL(val);
2742 
2743  #pragma omp atomic
2744  lhs_real[1] += CIMAG(val);
2745  }
2746  }
2747 }
2748 #endif
2749 
2750 #ifdef _OPENMP
2751 
2769 static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g,
2770  const R *psij_const0, const R *psij_const1, const R *xj0,
2771  const R *xj1, const INT n0, const INT n1, const INT m,
2772  const INT my_u0, const INT my_o0)
2773 {
2774  INT ar_u0,ar_o0,l0,u1,o1,l1;
2775  const INT lprod = (2*m+2) * (2*m+2);
2776  INT index_temp1[2*m+2];
2777 
2778  uo2(&ar_u0,&ar_o0,*xj0, n0, m);
2779  uo2(&u1,&o1,*xj1, n1, m);
2780 
2781  for (l1 = 0; l1 <= 2*m+1; l1++)
2782  index_temp1[l1] = (u1+l1)%n1;
2783 
2784  if(ar_u0 < ar_o0)
2785  {
2786  INT u0 = MAX(my_u0,ar_u0);
2787  INT o0 = MIN(my_o0,ar_o0);
2788  INT offset_psij = u0-ar_u0;
2789 #ifdef OMP_ASSERT
2790  assert(offset_psij >= 0);
2791  assert(o0-u0 <= 2*m+1);
2792  assert(offset_psij+o0-u0 <= 2*m+1);
2793 #endif
2794 
2795  for (l0 = 0; l0 <= o0-u0; l0++)
2796  {
2797  INT i0 = (u0+l0) * n1;
2798  const C val0 = psij_const0[offset_psij+l0];
2799 
2800  for(l1=0; l1<=2*m+1; l1++)
2801  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2802  }
2803  }
2804  else
2805  {
2806  INT u0 = MAX(my_u0,ar_u0);
2807  INT o0 = my_o0;
2808  INT offset_psij = u0-ar_u0;
2809 #ifdef OMP_ASSERT
2810  assert(offset_psij >= 0);
2811  assert(o0-u0 <= 2*m+1);
2812  assert(offset_psij+o0-u0 <= 2*m+1);
2813 #endif
2814 
2815  for (l0 = 0; l0 <= o0-u0; l0++)
2816  {
2817  INT i0 = (u0+l0) * n1;
2818  const C val0 = psij_const0[offset_psij+l0];
2819 
2820  for(l1=0; l1<=2*m+1; l1++)
2821  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2822  }
2823 
2824  u0 = my_u0;
2825  o0 = MIN(my_o0,ar_o0);
2826  offset_psij += my_u0-ar_u0+n0;
2827 
2828 #ifdef OMP_ASSERT
2829  if (u0<=o0)
2830  {
2831  assert(o0-u0 <= 2*m+1);
2832  assert(offset_psij+o0-u0 <= 2*m+1);
2833  }
2834 #endif
2835 
2836  for (l0 = 0; l0 <= o0-u0; l0++)
2837  {
2838  INT i0 = (u0+l0) * n1;
2839  const C val0 = psij_const0[offset_psij+l0];
2840 
2841  for(l1=0; l1<=2*m+1; l1++)
2842  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2843  }
2844  }
2845 }
2846 #endif
2847 
2848 #ifndef _OPENMP
2849 static void nfft_adjoint_2d_compute_serial(const C *fj, C *g,
2850  const R *psij_const0, const R *psij_const1, const R *xj0,
2851  const R *xj1, const INT n0, const INT n1, const INT m)
2852 {
2853  INT u0,o0,l0,u1,o1,l1;
2854  C *gj;
2855  const R *psij0,*psij1;
2856 
2857  psij0=psij_const0;
2858  psij1=psij_const1;
2859 
2860  uo2(&u0,&o0,*xj0, n0, m);
2861  uo2(&u1,&o1,*xj1, n1, m);
2862 
2863  if(u0<o0)
2864  if(u1<o1)
2865  for(l0=0; l0<=2*m+1; l0++,psij0++)
2866  {
2867  psij1=psij_const1;
2868  gj=g+(u0+l0)*n1+u1;
2869  for(l1=0; l1<=2*m+1; l1++)
2870  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2871  }
2872  else
2873  for(l0=0; l0<=2*m+1; l0++,psij0++)
2874  {
2875  psij1=psij_const1;
2876  gj=g+(u0+l0)*n1+u1;
2877  for(l1=0; l1<2*m+1-o1; l1++)
2878  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2879  gj=g+(u0+l0)*n1;
2880  for(l1=0; l1<=o1; l1++)
2881  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2882  }
2883  else
2884  if(u1<o1)
2885  {
2886  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2887  {
2888  psij1=psij_const1;
2889  gj=g+(u0+l0)*n1+u1;
2890  for(l1=0; l1<=2*m+1; l1++)
2891  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2892  }
2893  for(l0=0; l0<=o0; l0++,psij0++)
2894  {
2895  psij1=psij_const1;
2896  gj=g+l0*n1+u1;
2897  for(l1=0; l1<=2*m+1; l1++)
2898  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2899  }
2900  }
2901  else
2902  {
2903  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2904  {
2905  psij1=psij_const1;
2906  gj=g+(u0+l0)*n1+u1;
2907  for(l1=0; l1<2*m+1-o1; l1++)
2908  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2909  gj=g+(u0+l0)*n1;
2910  for(l1=0; l1<=o1; l1++)
2911  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2912  }
2913  for(l0=0; l0<=o0; l0++,psij0++)
2914  {
2915  psij1=psij_const1;
2916  gj=g+l0*n1+u1;
2917  for(l1=0; l1<2*m+1-o1; l1++)
2918  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2919  gj=g+l0*n1;
2920  for(l1=0; l1<=o1; l1++)
2921  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2922  }
2923  }
2924 }
2925 #endif
2926 
2927 static void nfft_trafo_2d_B(X(plan) *ths)
2928 {
2929  const C *g = (C*)ths->g;
2930  const INT n0 = ths->n[0];
2931  const INT n1 = ths->n[1];
2932  const INT M = ths->M_total;
2933  const INT m = ths->m;
2934 
2935  INT k;
2936 
2937  if(ths->flags & PRE_FULL_PSI)
2938  {
2939  const INT lprod = (2*m+2) * (2*m+2);
2940 #ifdef _OPENMP
2941  #pragma omp parallel for default(shared) private(k)
2942 #endif
2943  for (k = 0; k < M; k++)
2944  {
2945  INT l;
2946  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2947  ths->f[j] = K(0.0);
2948  for (l = 0; l < lprod; l++)
2949  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
2950  }
2951  return;
2952  } /* if(PRE_FULL_PSI) */
2953 
2954  if(ths->flags & PRE_PSI)
2955  {
2956 #ifdef _OPENMP
2957  #pragma omp parallel for default(shared) private(k)
2958 #endif
2959  for (k = 0; k < M; k++)
2960  {
2961  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2962  nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2963  }
2964 
2965  return;
2966  } /* if(PRE_PSI) */
2967 
2968  if(ths->flags & PRE_FG_PSI)
2969  {
2970  R fg_exp_l[2*(2*m+2)];
2971 
2972  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2973  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
2974 
2975 #ifdef _OPENMP
2976  #pragma omp parallel for default(shared) private(k)
2977 #endif
2978  for (k = 0; k < M; k++)
2979  {
2980  R psij_const[2*(2*m+2)];
2981  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2982  INT l;
2983  R fg_psij0 = ths->psi[2*j*2];
2984  R fg_psij1 = ths->psi[2*j*2+1];
2985  R fg_psij2 = K(1.0);
2986 
2987  psij_const[0] = fg_psij0;
2988  for (l = 1; l <= 2*m+1; l++)
2989  {
2990  fg_psij2 *= fg_psij1;
2991  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
2992  }
2993 
2994  fg_psij0 = ths->psi[2*(j*2+1)];
2995  fg_psij1 = ths->psi[2*(j*2+1)+1];
2996  fg_psij2 = K(1.0);
2997  psij_const[2*m+2] = fg_psij0;
2998  for (l = 1; l <= 2*m+1; l++)
2999  {
3000  fg_psij2 *= fg_psij1;
3001  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3002  }
3003 
3004  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3005  }
3006 
3007  return;
3008  } /* if(PRE_FG_PSI) */
3009 
3010  if(ths->flags & FG_PSI)
3011  {
3012  R fg_exp_l[2*(2*m+2)];
3013 
3014  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3015  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3016 
3017  sort(ths);
3018 
3019 #ifdef _OPENMP
3020  #pragma omp parallel for default(shared) private(k)
3021 #endif
3022  for (k = 0; k < M; k++)
3023  {
3024  INT u, o, l;
3025  R fg_psij0, fg_psij1, fg_psij2;
3026  R psij_const[2*(2*m+2)];
3027  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3028 
3029  uo(ths, j, &u, &o, (INT)0);
3030  fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
3031  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3032  fg_psij2 = K(1.0);
3033  psij_const[0] = fg_psij0;
3034  for (l = 1; l <= 2*m+1; l++)
3035  {
3036  fg_psij2 *= fg_psij1;
3037  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3038  }
3039 
3040  uo(ths,j,&u,&o, (INT)1);
3041  fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3042  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3043  fg_psij2 = K(1.0);
3044  psij_const[2*m+2] = fg_psij0;
3045  for(l=1; l<=2*m+1; l++)
3046  {
3047  fg_psij2 *= fg_psij1;
3048  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3049  }
3050 
3051  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3052  }
3053 
3054  return;
3055  } /* if(FG_PSI) */
3056 
3057  if(ths->flags & PRE_LIN_PSI)
3058  {
3059  const INT K = ths->K, ip_s = K / (m + 2);
3060 
3061  sort(ths);
3062 
3063 #ifdef _OPENMP
3064  #pragma omp parallel for default(shared) private(k)
3065 #endif
3066  for (k = 0; k < M; k++)
3067  {
3068  INT u, o, l;
3069  R ip_y, ip_w;
3070  INT ip_u;
3071  R psij_const[2*(2*m+2)];
3072  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3073 
3074  uo(ths,j,&u,&o,(INT)0);
3075  ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
3076  ip_u = (INT)LRINT(FLOOR(ip_y));
3077  ip_w = ip_y - (R)(ip_u);
3078  for (l = 0; l < 2*m+2; l++)
3079  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3080 
3081  uo(ths,j,&u,&o,(INT)1);
3082  ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
3083  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3084  ip_w = ip_y - (R)(ip_u);
3085  for (l = 0; l < 2*m+2; l++)
3086  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3087 
3088  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3089  }
3090  return;
3091  } /* if(PRE_LIN_PSI) */
3092 
3093  /* no precomputed psi at all */
3094 
3095  sort(ths);
3096 
3097 #ifdef _OPENMP
3098  #pragma omp parallel for default(shared) private(k)
3099 #endif
3100  for (k = 0; k < M; k++)
3101  {
3102  R psij_const[2*(2*m+2)];
3103  INT u, o, l;
3104  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3105 
3106  uo(ths,j,&u,&o,(INT)0);
3107  for (l = 0; l <= 2*m+1; l++)
3108  psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3109 
3110  uo(ths,j,&u,&o,(INT)1);
3111  for (l = 0; l <= 2*m+1; l++)
3112  psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
3113 
3114  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3115  }
3116 }
3117 
3118 #ifdef OMP_ASSERT
3119 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3120 { \
3121  assert(ar_x[2*k] >= min_u_a || k == M-1); \
3122  if (k > 0) \
3123  assert(ar_x[2*k-2] < min_u_a); \
3124 }
3125 #else
3126 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
3127 #endif
3128 
3129 #ifdef OMP_ASSERT
3130 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3131 { \
3132  assert(ar_x[2*k] >= min_u_b || k == M-1); \
3133  if (k > 0) \
3134  assert(ar_x[2*k-2] < min_u_b); \
3135 }
3136 #else
3137 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
3138 #endif
3139 
3140 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
3141  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3142  ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
3143  ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
3144 
3145 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
3146 { \
3147  R psij_const[2*(2*m+2)]; \
3148  INT u, o, l; \
3149  R fg_psij0 = ths->psi[2*j*2]; \
3150  R fg_psij1 = ths->psi[2*j*2+1]; \
3151  R fg_psij2 = K(1.0); \
3152  \
3153  psij_const[0] = fg_psij0; \
3154  for(l=1; l<=2*m+1; l++) \
3155  { \
3156  fg_psij2 *= fg_psij1; \
3157  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3158  } \
3159  \
3160  fg_psij0 = ths->psi[2*(j*2+1)]; \
3161  fg_psij1 = ths->psi[2*(j*2+1)+1]; \
3162  fg_psij2 = K(1.0); \
3163  psij_const[2*m+2] = fg_psij0; \
3164  for(l=1; l<=2*m+1; l++) \
3165  { \
3166  fg_psij2 *= fg_psij1; \
3167  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3168  } \
3169  \
3170  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3171  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3172  n0, n1, m, my_u0, my_o0); \
3173 }
3174 
3175 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
3176 { \
3177  R psij_const[2*(2*m+2)]; \
3178  R fg_psij0, fg_psij1, fg_psij2; \
3179  INT u, o, l; \
3180  \
3181  uo(ths,j,&u,&o,(INT)0); \
3182  fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/n0,0)); \
3183  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
3184  fg_psij2 = K(1.0); \
3185  psij_const[0] = fg_psij0; \
3186  for(l=1; l<=2*m+1; l++) \
3187  { \
3188  fg_psij2 *= fg_psij1; \
3189  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3190  } \
3191  \
3192  uo(ths,j,&u,&o,(INT)1); \
3193  fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/n1,1)); \
3194  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
3195  fg_psij2 = K(1.0); \
3196  psij_const[2*m+2] = fg_psij0; \
3197  for(l=1; l<=2*m+1; l++) \
3198  { \
3199  fg_psij2 *= fg_psij1; \
3200  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3201  } \
3202  \
3203  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3204  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3205  n0, n1, m, my_u0, my_o0); \
3206 }
3207 
3208 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
3209 { \
3210  R psij_const[2*(2*m+2)]; \
3211  INT u, o, l; \
3212  INT ip_u; \
3213  R ip_y, ip_w; \
3214  \
3215  uo(ths,j,&u,&o,(INT)0); \
3216  ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
3217  ip_u = LRINT(FLOOR(ip_y)); \
3218  ip_w = ip_y-ip_u; \
3219  for(l=0; l < 2*m+2; l++) \
3220  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3221  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
3222  \
3223  uo(ths,j,&u,&o,(INT)1); \
3224  ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
3225  ip_u = LRINT(FLOOR(ip_y)); \
3226  ip_w = ip_y-ip_u; \
3227  for(l=0; l < 2*m+2; l++) \
3228  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3229  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
3230  \
3231  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3232  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3233  n0, n1, m, my_u0, my_o0); \
3234 }
3235 
3236 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
3237 { \
3238  R psij_const[2*(2*m+2)]; \
3239  INT u, o, l; \
3240  \
3241  uo(ths,j,&u,&o,(INT)0); \
3242  for(l=0;l<=2*m+1;l++) \
3243  psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/n0,0)); \
3244  \
3245  uo(ths,j,&u,&o,(INT)1); \
3246  for(l=0;l<=2*m+1;l++) \
3247  psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
3248  \
3249  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3250  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3251  n0, n1, m, my_u0, my_o0); \
3252 }
3253 
3254 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
3255 { \
3256  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
3257  { \
3258  _Pragma("omp parallel private(k)") \
3259  { \
3260  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
3261  INT *ar_x = ths->index_x; \
3262  \
3263  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
3264  &min_u_b, &max_u_b, 2, ths->n, m); \
3265  \
3266  if (min_u_a != -1) \
3267  { \
3268  k = index_x_binary_search(ar_x, M, min_u_a); \
3269  \
3270  MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3271  \
3272  while (k < M) \
3273  { \
3274  INT u_prod = ar_x[2*k]; \
3275  INT j = ar_x[2*k+1]; \
3276  \
3277  if (u_prod < min_u_a || u_prod > max_u_a) \
3278  break; \
3279  \
3280  MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3281  \
3282  k++; \
3283  } \
3284  } \
3285  \
3286  if (min_u_b != -1) \
3287  { \
3288  INT k = index_x_binary_search(ar_x, M, min_u_b); \
3289  \
3290  MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3291  \
3292  while (k < M) \
3293  { \
3294  INT u_prod = ar_x[2*k]; \
3295  INT j = ar_x[2*k+1]; \
3296  \
3297  if (u_prod < min_u_b || u_prod > max_u_b) \
3298  break; \
3299  \
3300  MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3301  \
3302  k++; \
3303  } \
3304  } \
3305  } /* omp parallel */ \
3306  return; \
3307  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
3308 }
3309 
3310 
3311 static void nfft_adjoint_2d_B(X(plan) *ths)
3312 {
3313  const INT n0 = ths->n[0];
3314  const INT n1 = ths->n[1];
3315  const INT M = ths->M_total;
3316  const INT m = ths->m;
3317  C* g = (C*) ths->g;
3318  INT k;
3319 
3320  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
3321 
3322  if(ths->flags & PRE_FULL_PSI)
3323  {
3324  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3325  (INT)2, ths->n, m, ths->flags, ths->index_x);
3326  return;
3327  } /* if(PRE_FULL_PSI) */
3328 
3329  if(ths->flags & PRE_PSI)
3330  {
3331 #ifdef _OPENMP
3332  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
3333 #endif
3334 
3335 #ifdef _OPENMP
3336  #pragma omp parallel for default(shared) private(k)
3337 #endif
3338  for (k = 0; k < M; k++)
3339  {
3340  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3341 #ifdef _OPENMP
3342  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3343 #else
3344  nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3345 #endif
3346  }
3347  return;
3348  } /* if(PRE_PSI) */
3349 
3350  if(ths->flags & PRE_FG_PSI)
3351  {
3352  R fg_exp_l[2*(2*m+2)];
3353 
3354  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3355  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3356 
3357 #ifdef _OPENMP
3358  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
3359 #endif
3360 
3361 
3362 #ifdef _OPENMP
3363  #pragma omp parallel for default(shared) private(k)
3364 #endif
3365  for (k = 0; k < M; k++)
3366  {
3367  R psij_const[2*(2*m+2)];
3368  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3369  INT l;
3370  R fg_psij0 = ths->psi[2*j*2];
3371  R fg_psij1 = ths->psi[2*j*2+1];
3372  R fg_psij2 = K(1.0);
3373 
3374  psij_const[0] = fg_psij0;
3375  for(l=1; l<=2*m+1; l++)
3376  {
3377  fg_psij2 *= fg_psij1;
3378  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3379  }
3380 
3381  fg_psij0 = ths->psi[2*(j*2+1)];
3382  fg_psij1 = ths->psi[2*(j*2+1)+1];
3383  fg_psij2 = K(1.0);
3384  psij_const[2*m+2] = fg_psij0;
3385  for(l=1; l<=2*m+1; l++)
3386  {
3387  fg_psij2 *= fg_psij1;
3388  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3389  }
3390 
3391 #ifdef _OPENMP
3392  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3393 #else
3394  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3395 #endif
3396  }
3397 
3398  return;
3399  } /* if(PRE_FG_PSI) */
3400 
3401  if(ths->flags & FG_PSI)
3402  {
3403  R fg_exp_l[2*(2*m+2)];
3404 
3405  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3406  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3407 
3408  sort(ths);
3409 
3410 #ifdef _OPENMP
3411  MACRO_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
3412 #endif
3413 
3414 #ifdef _OPENMP
3415  #pragma omp parallel for default(shared) private(k)
3416 #endif
3417  for (k = 0; k < M; k++)
3418  {
3419  INT u, o, l;
3420  R fg_psij0, fg_psij1, fg_psij2;
3421  R psij_const[2*(2*m+2)];
3422  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3423 
3424  uo(ths,j,&u,&o,(INT)0);
3425  fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
3426  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3427  fg_psij2 = K(1.0);
3428  psij_const[0] = fg_psij0;
3429  for(l=1; l<=2*m+1; l++)
3430  {
3431  fg_psij2 *= fg_psij1;
3432  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3433  }
3434 
3435  uo(ths,j,&u,&o,(INT)1);
3436  fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3437  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3438  fg_psij2 = K(1.0);
3439  psij_const[2*m+2] = fg_psij0;
3440  for(l=1; l<=2*m+1; l++)
3441  {
3442  fg_psij2 *= fg_psij1;
3443  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3444  }
3445 
3446 #ifdef _OPENMP
3447  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3448 #else
3449  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3450 #endif
3451  }
3452 
3453  return;
3454  } /* if(FG_PSI) */
3455 
3456  if(ths->flags & PRE_LIN_PSI)
3457  {
3458  const INT K = ths->K;
3459  const INT ip_s = K / (m + 2);
3460 
3461  sort(ths);
3462 
3463 #ifdef _OPENMP
3464  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
3465 #endif
3466 
3467 #ifdef _OPENMP
3468  #pragma omp parallel for default(shared) private(k)
3469 #endif
3470  for (k = 0; k < M; k++)
3471  {
3472  INT u,o,l;
3473  INT ip_u;
3474  R ip_y, ip_w;
3475  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3476  R psij_const[2*(2*m+2)];
3477 
3478  uo(ths,j,&u,&o,(INT)0);
3479  ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
3480  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3481  ip_w = ip_y - (R)(ip_u);
3482  for(l=0; l < 2*m+2; l++)
3483  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3484  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3485 
3486  uo(ths,j,&u,&o,(INT)1);
3487  ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
3488  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3489  ip_w = ip_y - (R)(ip_u);
3490  for(l=0; l < 2*m+2; l++)
3491  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3492  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3493 
3494 #ifdef _OPENMP
3495  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3496 #else
3497  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3498 #endif
3499  }
3500  return;
3501  } /* if(PRE_LIN_PSI) */
3502 
3503  /* no precomputed psi at all */
3504  sort(ths);
3505 
3506 #ifdef _OPENMP
3507  MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3508 #endif
3509 
3510 #ifdef _OPENMP
3511  #pragma omp parallel for default(shared) private(k)
3512 #endif
3513  for (k = 0; k < M; k++)
3514  {
3515  INT u,o,l;
3516  R psij_const[2*(2*m+2)];
3517  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3518 
3519  uo(ths,j,&u,&o,(INT)0);
3520  for(l=0;l<=2*m+1;l++)
3521  psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3522 
3523  uo(ths,j,&u,&o,(INT)1);
3524  for(l=0;l<=2*m+1;l++)
3525  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
3526 
3527 #ifdef _OPENMP
3528  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3529 #else
3530  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3531 #endif
3532  }
3533 }
3534 
3535 
3536 void X(trafo_2d)(X(plan) *ths)
3537 {
3538  if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3539  {
3540  X(trafo_direct)(ths);
3541  return;
3542  }
3543 
3544  INT k0,k1,n0,n1,N0,N1;
3545  C *g_hat,*f_hat;
3546  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3547  R ck01, ck02, ck11, ck12;
3548  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3549 
3550  ths->g_hat=ths->g1;
3551  ths->g=ths->g2;
3552 
3553  N0=ths->N[0];
3554  N1=ths->N[1];
3555  n0=ths->n[0];
3556  n1=ths->n[1];
3557 
3558  f_hat=(C*)ths->f_hat;
3559  g_hat=(C*)ths->g_hat;
3560 
3561  TIC(0)
3562 #ifdef _OPENMP
3563  #pragma omp parallel for default(shared) private(k0)
3564  for (k0 = 0; k0 < ths->n_total; k0++)
3565  ths->g_hat[k0] = 0.0;
3566 #else
3567  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
3568 #endif
3569  if(ths->flags & PRE_PHI_HUT)
3570  {
3571  c_phi_inv01=ths->c_phi_inv[0];
3572  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3573 
3574 #ifdef _OPENMP
3575  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3576 #endif
3577  for(k0=0;k0<N0/2;k0++)
3578  {
3579  ck01=c_phi_inv01[k0];
3580  ck02=c_phi_inv02[k0];
3581 
3582  c_phi_inv11=ths->c_phi_inv[1];
3583  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3584 
3585  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3586  f_hat11=f_hat + k0*N1;
3587  g_hat21=g_hat + k0*n1+n1-(N1/2);
3588  f_hat21=f_hat + ((N0/2)+k0)*N1;
3589  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3590  f_hat12=f_hat + k0*N1+(N1/2);
3591  g_hat22=g_hat + k0*n1;
3592  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3593 
3594  for(k1=0;k1<N1/2;k1++)
3595  {
3596  ck11=c_phi_inv11[k1];
3597  ck12=c_phi_inv12[k1];
3598 
3599  g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3600  g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3601  g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3602  g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3603  }
3604  }
3605  }
3606  else
3607 #ifdef _OPENMP
3608  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3609 #endif
3610  for(k0=0;k0<N0/2;k0++)
3611  {
3612  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3613  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3614  for(k1=0;k1<N1/2;k1++)
3615  {
3616  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3617  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3618  g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3619  g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3620  g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3621  g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3622  }
3623  }
3624 
3625  TOC(0)
3626 
3627  TIC_FFTW(1)
3628  FFTW(execute)(ths->my_fftw_plan1);
3629  TOC_FFTW(1);
3630 
3631  TIC(2);
3632  nfft_trafo_2d_B(ths);
3633  TOC(2);
3634 }
3635 
3636 void X(adjoint_2d)(X(plan) *ths)
3637 {
3638  if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3639  {
3640  X(adjoint_direct)(ths);
3641  return;
3642  }
3643 
3644  INT k0,k1,n0,n1,N0,N1;
3645  C *g_hat,*f_hat;
3646  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3647  R ck01, ck02, ck11, ck12;
3648  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3649 
3650  ths->g_hat=ths->g1;
3651  ths->g=ths->g2;
3652 
3653  N0=ths->N[0];
3654  N1=ths->N[1];
3655  n0=ths->n[0];
3656  n1=ths->n[1];
3657 
3658  f_hat=(C*)ths->f_hat;
3659  g_hat=(C*)ths->g_hat;
3660 
3661  TIC(2);
3662  nfft_adjoint_2d_B(ths);
3663  TOC(2);
3664 
3665  TIC_FFTW(1)
3666  FFTW(execute)(ths->my_fftw_plan2);
3667  TOC_FFTW(1);
3668 
3669  TIC(0)
3670  if(ths->flags & PRE_PHI_HUT)
3671  {
3672  c_phi_inv01=ths->c_phi_inv[0];
3673  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3674 
3675 #ifdef _OPENMP
3676  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3677 #endif
3678  for(k0=0;k0<N0/2;k0++)
3679  {
3680  ck01=c_phi_inv01[k0];
3681  ck02=c_phi_inv02[k0];
3682 
3683  c_phi_inv11=ths->c_phi_inv[1];
3684  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3685 
3686  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3687  f_hat11=f_hat + k0*N1;
3688  g_hat21=g_hat + k0*n1+n1-(N1/2);
3689  f_hat21=f_hat + ((N0/2)+k0)*N1;
3690  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3691  f_hat12=f_hat + k0*N1+(N1/2);
3692  g_hat22=g_hat + k0*n1;
3693  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3694 
3695  for(k1=0;k1<N1/2;k1++)
3696  {
3697  ck11=c_phi_inv11[k1];
3698  ck12=c_phi_inv12[k1];
3699 
3700  f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3701  f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3702  f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3703  f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3704  }
3705  }
3706  }
3707  else
3708 #ifdef _OPENMP
3709  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3710 #endif
3711  for(k0=0;k0<N0/2;k0++)
3712  {
3713  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3714  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3715  for(k1=0;k1<N1/2;k1++)
3716  {
3717  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3718  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3719  f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3720  f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3721  f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3722  f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3723  }
3724  }
3725  TOC(0)
3726 }
3727 
3728 /* ################################################ SPECIFIC VERSIONS FOR d=3 */
3729 
3730 static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
3731 {
3732  INT l;
3733  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
3734 
3735  fg_exp_b0 = EXP(-K(1.0) / b);
3736  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
3737  fg_exp_b1 = K(1.0);
3738  fg_exp_b2 = K(1.0);
3739  fg_exp_l[0] = K(1.0);
3740  for(l=1; l <= 2*m+1; l++)
3741  {
3742  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
3743  fg_exp_b1 *= fg_exp_b0_sq;
3744  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
3745  }
3746 }
3747 
3748 static void nfft_trafo_3d_compute(C *fj, const C *g, const R *psij_const0,
3749  const R *psij_const1, const R *psij_const2, const R *xj0, const R *xj1,
3750  const R *xj2, const INT n0, const INT n1, const INT n2, const INT m)
3751 {
3752  INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
3753  const C *gj;
3754  const R *psij0, *psij1, *psij2;
3755 
3756  psij0 = psij_const0;
3757  psij1 = psij_const1;
3758  psij2 = psij_const2;
3759 
3760  uo2(&u0, &o0, *xj0, n0, m);
3761  uo2(&u1, &o1, *xj1, n1, m);
3762  uo2(&u2, &o2, *xj2, n2, m);
3763 
3764  *fj = 0;
3765 
3766  if (u0 < o0)
3767  if (u1 < o1)
3768  if (u2 < o2)
3769  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3770  {
3771  psij1 = psij_const1;
3772  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3773  {
3774  psij2 = psij_const2;
3775  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3776  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3777  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3778  }
3779  }
3780  else
3781  /* asserts (u2>o2)*/
3782  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3783  {
3784  psij1 = psij_const1;
3785  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3786  {
3787  psij2 = psij_const2;
3788  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3789  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3790  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3791  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3792  for (l2 = 0; l2 <= o2; l2++)
3793  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3794  }
3795  }
3796  else /* asserts (u1>o1)*/
3797  if (u2 < o2)
3798  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3799  {
3800  psij1 = psij_const1;
3801  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3802  {
3803  psij2 = psij_const2;
3804  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3805  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3806  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3807  }
3808  for (l1 = 0; l1 <= o1; l1++, psij1++)
3809  {
3810  psij2 = psij_const2;
3811  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3812  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3813  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3814  }
3815  }
3816  else/* asserts (u2>o2) */
3817  {
3818  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3819  {
3820  psij1 = psij_const1;
3821  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3822  {
3823  psij2 = psij_const2;
3824  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3825  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3826  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3827  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3828  for (l2 = 0; l2 <= o2; l2++)
3829  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3830  }
3831  for (l1 = 0; l1 <= o1; l1++, psij1++)
3832  {
3833  psij2 = psij_const2;
3834  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3835  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3836  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3837  gj = g + ((u0 + l0) * n1 + l1) * n2;
3838  for (l2 = 0; l2 <= o2; l2++)
3839  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3840  }
3841  }
3842  }
3843  else /* asserts (u0>o0) */
3844  if (u1 < o1)
3845  if (u2 < o2)
3846  {
3847  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3848  {
3849  psij1 = psij_const1;
3850  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3851  {
3852  psij2 = psij_const2;
3853  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3854  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3855  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3856  }
3857  }
3858 
3859  for (l0 = 0; l0 <= o0; l0++, psij0++)
3860  {
3861  psij1 = psij_const1;
3862  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3863  {
3864  psij2 = psij_const2;
3865  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3866  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3867  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3868  }
3869  }
3870  } else/* asserts (u2>o2) */
3871  {
3872  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3873  {
3874  psij1 = psij_const1;
3875  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3876  {
3877  psij2 = psij_const2;
3878  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3879  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3880  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3881  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3882  for (l2 = 0; l2 <= o2; l2++)
3883  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3884  }
3885  }
3886 
3887  for (l0 = 0; l0 <= o0; l0++, psij0++)
3888  {
3889  psij1 = psij_const1;
3890  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3891  {
3892  psij2 = psij_const2;
3893  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3894  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3895  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3896  gj = g + (l0 * n1 + (u1 + l1)) * n2;
3897  for (l2 = 0; l2 <= o2; l2++)
3898  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3899  }
3900  }
3901  }
3902  else /* asserts (u1>o1) */
3903  if (u2 < o2)
3904  {
3905  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3906  {
3907  psij1 = psij_const1;
3908  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3909  {
3910  psij2 = psij_const2;
3911  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3912  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3913  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3914  }
3915  for (l1 = 0; l1 <= o1; l1++, psij1++)
3916  {
3917  psij2 = psij_const2;
3918  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3919  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3920  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3921  }
3922  }
3923  for (l0 = 0; l0 <= o0; l0++, psij0++)
3924  {
3925  psij1 = psij_const1;
3926  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3927  {
3928  psij2 = psij_const2;
3929  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3930  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3931  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3932  }
3933  for (l1 = 0; l1 <= o1; l1++, psij1++)
3934  {
3935  psij2 = psij_const2;
3936  gj = g + (l0 * n1 + l1) * n2 + u2;
3937  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3938  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3939  }
3940  }
3941  } else/* asserts (u2>o2) */
3942  {
3943  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3944  {
3945  psij1 = psij_const1;
3946  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3947  {
3948  psij2 = psij_const2;
3949  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3950  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3951  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3952  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3953  for (l2 = 0; l2 <= o2; l2++)
3954  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3955  }
3956  for (l1 = 0; l1 <= o1; l1++, psij1++)
3957  {
3958  psij2 = psij_const2;
3959  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3960  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3961  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3962  gj = g + ((u0 + l0) * n1 + l1) * n2;
3963  for (l2 = 0; l2 <= o2; l2++)
3964  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3965  }
3966  }
3967 
3968  for (l0 = 0; l0 <= o0; l0++, psij0++)
3969  {
3970  psij1 = psij_const1;
3971  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3972  {
3973  psij2 = psij_const2;
3974  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3975  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3976  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3977  gj = g + (l0 * n1 + (u1 + l1)) * n2;
3978  for (l2 = 0; l2 <= o2; l2++)
3979  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3980  }
3981  for (l1 = 0; l1 <= o1; l1++, psij1++)
3982  {
3983  psij2 = psij_const2;
3984  gj = g + (l0 * n1 + l1) * n2 + u2;
3985  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3986  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3987  gj = g + (l0 * n1 + l1) * n2;
3988  for (l2 = 0; l2 <= o2; l2++)
3989  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3990  }
3991  }
3992  }
3993 }
3994 
3995 #ifdef _OPENMP
3996 
4017 static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g,
4018  const R *psij_const0, const R *psij_const1, const R *psij_const2,
4019  const R *xj0, const R *xj1, const R *xj2,
4020  const INT n0, const INT n1, const INT n2, const INT m,
4021  const INT my_u0, const INT my_o0)
4022 {
4023  INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4024  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4025 
4026  INT index_temp1[2*m+2];
4027  INT index_temp2[2*m+2];
4028 
4029  uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4030  uo2(&u1,&o1,*xj1, n1, m);
4031  uo2(&u2,&o2,*xj2, n2, m);
4032 
4033  for (l1=0; l1<=2*m+1; l1++)
4034  index_temp1[l1] = (u1+l1)%n1;
4035 
4036  for (l2=0; l2<=2*m+1; l2++)
4037  index_temp2[l2] = (u2+l2)%n2;
4038 
4039  if(ar_u0<ar_o0)
4040  {
4041  INT u0 = MAX(my_u0,ar_u0);
4042  INT o0 = MIN(my_o0,ar_o0);
4043  INT offset_psij = u0-ar_u0;
4044 #ifdef OMP_ASSERT
4045  assert(offset_psij >= 0);
4046  assert(o0-u0 <= 2*m+1);
4047  assert(offset_psij+o0-u0 <= 2*m+1);
4048 #endif
4049 
4050  for (l0 = 0; l0 <= o0-u0; l0++)
4051  {
4052  const INT i0 = (u0+l0) * n1;
4053  const C val0 = psij_const0[offset_psij+l0];
4054 
4055  for(l1=0; l1<=2*m+1; l1++)
4056  {
4057  const INT i1 = (i0 + index_temp1[l1]) * n2;
4058  const C val1 = psij_const1[l1];
4059 
4060  for(l2=0; l2<=2*m+1; l2++)
4061  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4062  }
4063  }
4064  }
4065  else
4066  {
4067  INT u0 = MAX(my_u0,ar_u0);
4068  INT o0 = my_o0;
4069  INT offset_psij = u0-ar_u0;
4070 #ifdef OMP_ASSERT
4071  assert(offset_psij >= 0);
4072  assert(o0-u0 <= 2*m+1);
4073  assert(offset_psij+o0-u0 <= 2*m+1);
4074 #endif
4075 
4076  for (l0 = 0; l0 <= o0-u0; l0++)
4077  {
4078  INT i0 = (u0+l0) * n1;
4079  const C val0 = psij_const0[offset_psij+l0];
4080 
4081  for(l1=0; l1<=2*m+1; l1++)
4082  {
4083  const INT i1 = (i0 + index_temp1[l1]) * n2;
4084  const C val1 = psij_const1[l1];
4085 
4086  for(l2=0; l2<=2*m+1; l2++)
4087  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4088  }
4089  }
4090 
4091  u0 = my_u0;
4092  o0 = MIN(my_o0,ar_o0);
4093  offset_psij += my_u0-ar_u0+n0;
4094 
4095 #ifdef OMP_ASSERT
4096  if (u0<=o0)
4097  {
4098  assert(o0-u0 <= 2*m+1);
4099  assert(offset_psij+o0-u0 <= 2*m+1);
4100  }
4101 #endif
4102  for (l0 = 0; l0 <= o0-u0; l0++)
4103  {
4104  INT i0 = (u0+l0) * n1;
4105  const C val0 = psij_const0[offset_psij+l0];
4106 
4107  for(l1=0; l1<=2*m+1; l1++)
4108  {
4109  const INT i1 = (i0 + index_temp1[l1]) * n2;
4110  const C val1 = psij_const1[l1];
4111 
4112  for(l2=0; l2<=2*m+1; l2++)
4113  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4114  }
4115  }
4116  }
4117 }
4118 #endif
4119 
4120 #ifdef _OPENMP
4121 /* adjoint NFFT three-dimensional case with OpenMP atomic operations */
4122 static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g,
4123  const R *psij_const0, const R *psij_const1, const R *psij_const2,
4124  const R *xj0, const R *xj1, const R *xj2,
4125  const INT n0, const INT n1, const INT n2, const INT m)
4126 {
4127  INT u0,o0,l0,u1,o1,l1,u2,o2,l2;
4128  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4129 
4130  INT index_temp0[2*m+2];
4131  INT index_temp1[2*m+2];
4132  INT index_temp2[2*m+2];
4133 
4134  uo2(&u0,&o0,*xj0, n0, m);
4135  uo2(&u1,&o1,*xj1, n1, m);
4136  uo2(&u2,&o2,*xj2, n2, m);
4137 
4138  for (l0=0; l0<=2*m+1; l0++)
4139  index_temp0[l0] = (u0+l0)%n0;
4140 
4141  for (l1=0; l1<=2*m+1; l1++)
4142  index_temp1[l1] = (u1+l1)%n1;
4143 
4144  for (l2=0; l2<=2*m+1; l2++)
4145  index_temp2[l2] = (u2+l2)%n2;
4146 
4147  for(l0=0; l0<=2*m+1; l0++)
4148  {
4149  for(l1=0; l1<=2*m+1; l1++)
4150  {
4151  for(l2=0; l2<=2*m+1; l2++)
4152  {
4153  INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4154  C *lhs = g+i;
4155  R *lhs_real = (R*)lhs;
4156  C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4157 
4158 #pragma omp atomic
4159  lhs_real[0] += CREAL(val);
4160 
4161 #pragma omp atomic
4162  lhs_real[1] += CIMAG(val);
4163  }
4164  }
4165  }
4166 }
4167 #endif
4168 
4169 #ifndef _OPENMP
4170 static void nfft_adjoint_3d_compute_serial(const C *fj, C *g,
4171  const R *psij_const0, const R *psij_const1, const R *psij_const2, const R *xj0,
4172  const R *xj1, const R *xj2, const INT n0, const INT n1, const INT n2,
4173  const INT m)
4174 {
4175  INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4176  C *gj;
4177  const R *psij0, *psij1, *psij2;
4178 
4179  psij0 = psij_const0;
4180  psij1 = psij_const1;
4181  psij2 = psij_const2;
4182 
4183  uo2(&u0, &o0, *xj0, n0, m);
4184  uo2(&u1, &o1, *xj1, n1, m);
4185  uo2(&u2, &o2, *xj2, n2, m);
4186 
4187  if (u0 < o0)
4188  if (u1 < o1)
4189  if (u2 < o2)
4190  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4191  {
4192  psij1 = psij_const1;
4193  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4194  {
4195  psij2 = psij_const2;
4196  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4197  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4198  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4199  }
4200  }
4201  else
4202  /* asserts (u2>o2)*/
4203  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4204  {
4205  psij1 = psij_const1;
4206  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4207  {
4208  psij2 = psij_const2;
4209  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4210  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4211  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4212  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4213  for (l2 = 0; l2 <= o2; l2++)
4214  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4215  }
4216  }
4217  else /* asserts (u1>o1)*/
4218  if (u2 < o2)
4219  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4220  {
4221  psij1 = psij_const1;
4222  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4223  {
4224  psij2 = psij_const2;
4225  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4226  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4227  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4228  }
4229  for (l1 = 0; l1 <= o1; l1++, psij1++)
4230  {
4231  psij2 = psij_const2;
4232  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4233  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4234  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4235  }
4236  }
4237  else/* asserts (u2>o2) */
4238  {
4239  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4240  {
4241  psij1 = psij_const1;
4242  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4243  {
4244  psij2 = psij_const2;
4245  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4246  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4247  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4248  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4249  for (l2 = 0; l2 <= o2; l2++)
4250  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4251  }
4252  for (l1 = 0; l1 <= o1; l1++, psij1++)
4253  {
4254  psij2 = psij_const2;
4255  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4256  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4257  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4258  gj = g + ((u0 + l0) * n1 + l1) * n2;
4259  for (l2 = 0; l2 <= o2; l2++)
4260  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4261  }
4262  }
4263  }
4264  else /* asserts (u0>o0) */
4265  if (u1 < o1)
4266  if (u2 < o2)
4267  {
4268  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4269  {
4270  psij1 = psij_const1;
4271  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4272  {
4273  psij2 = psij_const2;
4274  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4275  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4276  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4277  }
4278  }
4279 
4280  for (l0 = 0; l0 <= o0; l0++, psij0++)
4281  {
4282  psij1 = psij_const1;
4283  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4284  {
4285  psij2 = psij_const2;
4286  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4287  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4288  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4289  }
4290  }
4291  } else/* asserts (u2>o2) */
4292  {
4293  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4294  {
4295  psij1 = psij_const1;
4296  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4297  {
4298  psij2 = psij_const2;
4299  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4300  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4301  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4302  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4303  for (l2 = 0; l2 <= o2; l2++)
4304  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4305  }
4306  }
4307 
4308  for (l0 = 0; l0 <= o0; l0++, psij0++)
4309  {
4310  psij1 = psij_const1;
4311  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4312  {
4313  psij2 = psij_const2;
4314  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4315  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4316  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4317  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4318  for (l2 = 0; l2 <= o2; l2++)
4319  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4320  }
4321  }
4322  }
4323  else /* asserts (u1>o1) */
4324  if (u2 < o2)
4325  {
4326  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4327  {
4328  psij1 = psij_const1;
4329  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4330  {
4331  psij2 = psij_const2;
4332  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4333  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4334  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4335  }
4336  for (l1 = 0; l1 <= o1; l1++, psij1++)
4337  {
4338  psij2 = psij_const2;
4339  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4340  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4341  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4342  }
4343  }
4344  for (l0 = 0; l0 <= o0; l0++, psij0++)
4345  {
4346  psij1 = psij_const1;
4347  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4348  {
4349  psij2 = psij_const2;
4350  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4351  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4352  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4353  }
4354  for (l1 = 0; l1 <= o1; l1++, psij1++)
4355  {
4356  psij2 = psij_const2;
4357  gj = g + (l0 * n1 + l1) * n2 + u2;
4358  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4359  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4360  }
4361  }
4362  } else/* asserts (u2>o2) */
4363  {
4364  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4365  {
4366  psij1 = psij_const1;
4367  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4368  {
4369  psij2 = psij_const2;
4370  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4371  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4372  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4373  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4374  for (l2 = 0; l2 <= o2; l2++)
4375  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4376  }
4377  for (l1 = 0; l1 <= o1; l1++, psij1++)
4378  {
4379  psij2 = psij_const2;
4380  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4381  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4382  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4383  gj = g + ((u0 + l0) * n1 + l1) * n2;
4384  for (l2 = 0; l2 <= o2; l2++)
4385  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4386  }
4387  }
4388 
4389  for (l0 = 0; l0 <= o0; l0++, psij0++)
4390  {
4391  psij1 = psij_const1;
4392  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4393  {
4394  psij2 = psij_const2;
4395  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4396  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4397  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4398  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4399  for (l2 = 0; l2 <= o2; l2++)
4400  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4401  }
4402  for (l1 = 0; l1 <= o1; l1++, psij1++)
4403  {
4404  psij2 = psij_const2;
4405  gj = g + (l0 * n1 + l1) * n2 + u2;
4406  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4407  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4408  gj = g + (l0 * n1 + l1) * n2;
4409  for (l2 = 0; l2 <= o2; l2++)
4410  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4411  }
4412  }
4413  }
4414 }
4415 #endif
4416 
4417 static void nfft_trafo_3d_B(X(plan) *ths)
4418 {
4419  const INT n0 = ths->n[0];
4420  const INT n1 = ths->n[1];
4421  const INT n2 = ths->n[2];
4422  const INT M = ths->M_total;
4423  const INT m = ths->m;
4424 
4425  const C* g = (C*) ths->g;
4426 
4427  INT k;
4428 
4429  if(ths->flags & PRE_FULL_PSI)
4430  {
4431  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4432 #ifdef _OPENMP
4433  #pragma omp parallel for default(shared) private(k)
4434 #endif
4435  for (k = 0; k < M; k++)
4436  {
4437  INT l;
4438  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4439  ths->f[j] = K(0.0);
4440  for (l = 0; l < lprod; l++)
4441  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4442  }
4443  return;
4444  } /* if(PRE_FULL_PSI) */
4445 
4446  if(ths->flags & PRE_PSI)
4447  {
4448 #ifdef _OPENMP
4449  #pragma omp parallel for default(shared) private(k)
4450 #endif
4451  for (k = 0; k < M; k++)
4452  {
4453  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4454  nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4455  }
4456  return;
4457  } /* if(PRE_PSI) */
4458 
4459  if(ths->flags & PRE_FG_PSI)
4460  {
4461  R fg_exp_l[3*(2*m+2)];
4462 
4463  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4464  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4465  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4466 
4467 #ifdef _OPENMP
4468  #pragma omp parallel for default(shared) private(k)
4469 #endif
4470  for (k = 0; k < M; k++)
4471  {
4472  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4473  INT l;
4474  R psij_const[3*(2*m+2)];
4475  R fg_psij0 = ths->psi[2*j*3];
4476  R fg_psij1 = ths->psi[2*j*3+1];
4477  R fg_psij2 = K(1.0);
4478 
4479  psij_const[0] = fg_psij0;
4480  for(l=1; l<=2*m+1; l++)
4481  {
4482  fg_psij2 *= fg_psij1;
4483  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4484  }
4485 
4486  fg_psij0 = ths->psi[2*(j*3+1)];
4487  fg_psij1 = ths->psi[2*(j*3+1)+1];
4488  fg_psij2 = K(1.0);
4489  psij_const[2*m+2] = fg_psij0;
4490  for(l=1; l<=2*m+1; l++)
4491  {
4492  fg_psij2 *= fg_psij1;
4493  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4494  }
4495 
4496  fg_psij0 = ths->psi[2*(j*3+2)];
4497  fg_psij1 = ths->psi[2*(j*3+2)+1];
4498  fg_psij2 = K(1.0);
4499  psij_const[2*(2*m+2)] = fg_psij0;
4500  for(l=1; l<=2*m+1; l++)
4501  {
4502  fg_psij2 *= fg_psij1;
4503  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4504  }
4505 
4506  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4507  }
4508 
4509  return;
4510  } /* if(PRE_FG_PSI) */
4511 
4512  if(ths->flags & FG_PSI)
4513  {
4514  R fg_exp_l[3*(2*m+2)];
4515 
4516  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4517  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4518  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4519 
4520  sort(ths);
4521 
4522 #ifdef _OPENMP
4523  #pragma omp parallel for default(shared) private(k)
4524 #endif
4525  for (k = 0; k < M; k++)
4526  {
4527  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4528  INT u, o, l;
4529  R psij_const[3*(2*m+2)];
4530  R fg_psij0, fg_psij1, fg_psij2;
4531 
4532  uo(ths,j,&u,&o,(INT)0);
4533  fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4534  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
4535  fg_psij2 = K(1.0);
4536  psij_const[0] = fg_psij0;
4537  for(l=1; l<=2*m+1; l++)
4538  {
4539  fg_psij2 *= fg_psij1;
4540  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4541  }
4542 
4543  uo(ths,j,&u,&o,(INT)1);
4544  fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4545  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
4546  fg_psij2 = K(1.0);
4547  psij_const[2*m+2] = fg_psij0;
4548  for(l=1; l<=2*m+1; l++)
4549  {
4550  fg_psij2 *= fg_psij1;
4551  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4552  }
4553 
4554  uo(ths,j,&u,&o,(INT)2);
4555  fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
4556  fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
4557  fg_psij2 = K(1.0);
4558  psij_const[2*(2*m+2)] = fg_psij0;
4559  for(l=1; l<=2*m+1; l++)
4560  {
4561  fg_psij2 *= fg_psij1;
4562  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4563  }
4564 
4565  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4566  }
4567 
4568  return;
4569  } /* if(FG_PSI) */
4570 
4571  if(ths->flags & PRE_LIN_PSI)
4572  {
4573  const INT K = ths->K, ip_s = K / (m + 2);
4574 
4575  sort(ths);
4576 
4577 #ifdef _OPENMP
4578  #pragma omp parallel for default(shared) private(k)
4579 #endif
4580  for (k = 0; k < M; k++)
4581  {
4582  INT u, o, l;
4583  R ip_y, ip_w;
4584  INT ip_u;
4585  R psij_const[3*(2*m+2)];
4586  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4587 
4588  uo(ths,j,&u,&o,(INT)0);
4589  ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
4590  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4591  ip_w = ip_y - (R)(ip_u);
4592  for(l=0; l < 2*m+2; l++)
4593  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4594  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4595 
4596  uo(ths,j,&u,&o,(INT)1);
4597  ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
4598  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4599  ip_w = ip_y - (R)(ip_u);
4600  for(l=0; l < 2*m+2; l++)
4601  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4602  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4603 
4604  uo(ths,j,&u,&o,(INT)2);
4605  ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
4606  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4607  ip_w = ip_y - (R)(ip_u);
4608  for(l=0; l < 2*m+2; l++)
4609  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4610  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4611 
4612  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4613  }
4614  return;
4615  } /* if(PRE_LIN_PSI) */
4616 
4617  /* no precomputed psi at all */
4618 
4619  sort(ths);
4620 
4621 #ifdef _OPENMP
4622  #pragma omp parallel for default(shared) private(k)
4623 #endif
4624  for (k = 0; k < M; k++)
4625  {
4626  R psij_const[3*(2*m+2)];
4627  INT u, o, l;
4628  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4629 
4630  uo(ths,j,&u,&o,(INT)0);
4631  for(l=0;l<=2*m+1;l++)
4632  psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
4633 
4634  uo(ths,j,&u,&o,(INT)1);
4635  for(l=0;l<=2*m+1;l++)
4636  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
4637 
4638  uo(ths,j,&u,&o,(INT)2);
4639  for(l=0;l<=2*m+1;l++)
4640  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
4641 
4642  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4643  }
4644 }
4645 
4646 #ifdef OMP_ASSERT
4647 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4648 { \
4649  assert(ar_x[2*k] >= min_u_a || k == M-1); \
4650  if (k > 0) \
4651  assert(ar_x[2*k-2] < min_u_a); \
4652 }
4653 #else
4654 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
4655 #endif
4656 
4657 #ifdef OMP_ASSERT
4658 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4659 { \
4660  assert(ar_x[2*k] >= min_u_b || k == M-1); \
4661  if (k > 0) \
4662  assert(ar_x[2*k-2] < min_u_b); \
4663 }
4664 #else
4665 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
4666 #endif
4667 
4668 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4669  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4670  ths->psi+j*3*(2*m+2), \
4671  ths->psi+(j*3+1)*(2*m+2), \
4672  ths->psi+(j*3+2)*(2*m+2), \
4673  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4674  n0, n1, n2, m, my_u0, my_o0);
4675 
4676 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4677 { \
4678  INT u, o, l; \
4679  R psij_const[3*(2*m+2)]; \
4680  R fg_psij0 = ths->psi[2*j*3]; \
4681  R fg_psij1 = ths->psi[2*j*3+1]; \
4682  R fg_psij2 = K(1.0); \
4683  \
4684  psij_const[0] = fg_psij0; \
4685  for(l=1; l<=2*m+1; l++) \
4686  { \
4687  fg_psij2 *= fg_psij1; \
4688  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4689  } \
4690  \
4691  fg_psij0 = ths->psi[2*(j*3+1)]; \
4692  fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4693  fg_psij2 = K(1.0); \
4694  psij_const[2*m+2] = fg_psij0; \
4695  for(l=1; l<=2*m+1; l++) \
4696  { \
4697  fg_psij2 *= fg_psij1; \
4698  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4699  } \
4700  \
4701  fg_psij0 = ths->psi[2*(j*3+2)]; \
4702  fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4703  fg_psij2 = K(1.0); \
4704  psij_const[2*(2*m+2)] = fg_psij0; \
4705  for(l=1; l<=2*m+1; l++) \
4706  { \
4707  fg_psij2 *= fg_psij1; \
4708  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4709  } \
4710  \
4711  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4712  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4713  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4714  n0, n1, n2, m, my_u0, my_o0); \
4715 }
4716 
4717 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4718 { \
4719  INT u, o, l; \
4720  R psij_const[3*(2*m+2)]; \
4721  R fg_psij0, fg_psij1, fg_psij2; \
4722  \
4723  uo(ths,j,&u,&o,(INT)0); \
4724  fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/n0,0)); \
4725  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
4726  fg_psij2 = K(1.0); \
4727  psij_const[0] = fg_psij0; \
4728  for(l=1; l<=2*m+1; l++) \
4729  { \
4730  fg_psij2 *= fg_psij1; \
4731  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4732  } \
4733  \
4734  uo(ths,j,&u,&o,(INT)1); \
4735  fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/n1,1)); \
4736  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
4737  fg_psij2 = K(1.0); \
4738  psij_const[2*m+2] = fg_psij0; \
4739  for(l=1; l<=2*m+1; l++) \
4740  { \
4741  fg_psij2 *= fg_psij1; \
4742  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4743  } \
4744  \
4745  uo(ths,j,&u,&o,(INT)2); \
4746  fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/n2,2)); \
4747  fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
4748  fg_psij2 = K(1.0); \
4749  psij_const[2*(2*m+2)] = fg_psij0; \
4750  for(l=1; l<=2*m+1; l++) \
4751  { \
4752  fg_psij2 *= fg_psij1; \
4753  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4754  } \
4755  \
4756  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4757  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4758  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4759  n0, n1, n2, m, my_u0, my_o0); \
4760 }
4761 
4762 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
4763 { \
4764  INT u, o, l; \
4765  R psij_const[3*(2*m+2)]; \
4766  INT ip_u; \
4767  R ip_y, ip_w; \
4768  \
4769  uo(ths,j,&u,&o,(INT)0); \
4770  ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
4771  ip_u = LRINT(FLOOR(ip_y)); \
4772  ip_w = ip_y-ip_u; \
4773  for(l=0; l < 2*m+2; l++) \
4774  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4775  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
4776  \
4777  uo(ths,j,&u,&o,(INT)1); \
4778  ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
4779  ip_u = LRINT(FLOOR(ip_y)); \
4780  ip_w = ip_y-ip_u; \
4781  for(l=0; l < 2*m+2; l++) \
4782  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4783  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4784  \
4785  uo(ths,j,&u,&o,(INT)2); \
4786  ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
4787  ip_u = LRINT(FLOOR(ip_y)); \
4788  ip_w = ip_y-ip_u; \
4789  for(l=0; l < 2*m+2; l++) \
4790  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4791  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4792  \
4793  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4794  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4795  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4796  n0, n1, n2, m, my_u0, my_o0); \
4797 }
4798 
4799 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
4800 { \
4801  INT u, o, l; \
4802  R psij_const[3*(2*m+2)]; \
4803  \
4804  uo(ths,j,&u,&o,(INT)0); \
4805  for(l=0;l<=2*m+1;l++) \
4806  psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/n0,0)); \
4807  \
4808  uo(ths,j,&u,&o,(INT)1); \
4809  for(l=0;l<=2*m+1;l++) \
4810  psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
4811  \
4812  uo(ths,j,&u,&o,(INT)2); \
4813  for(l=0;l<=2*m+1;l++) \
4814  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
4815  \
4816  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4817  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4818  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4819  n0, n1, n2, m, my_u0, my_o0); \
4820 }
4821 
4822 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
4823 { \
4824  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
4825  { \
4826  _Pragma("omp parallel private(k)") \
4827  { \
4828  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
4829  INT *ar_x = ths->index_x; \
4830  \
4831  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
4832  &min_u_b, &max_u_b, 3, ths->n, m); \
4833  \
4834  if (min_u_a != -1) \
4835  { \
4836  k = index_x_binary_search(ar_x, M, min_u_a); \
4837  \
4838  MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4839  \
4840  while (k < M) \
4841  { \
4842  INT u_prod = ar_x[2*k]; \
4843  INT j = ar_x[2*k+1]; \
4844  \
4845  if (u_prod < min_u_a || u_prod > max_u_a) \
4846  break; \
4847  \
4848  MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4849  \
4850  k++; \
4851  } \
4852  } \
4853  \
4854  if (min_u_b != -1) \
4855  { \
4856  INT k = index_x_binary_search(ar_x, M, min_u_b); \
4857  \
4858  MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4859  \
4860  while (k < M) \
4861  { \
4862  INT u_prod = ar_x[2*k]; \
4863  INT j = ar_x[2*k+1]; \
4864  \
4865  if (u_prod < min_u_b || u_prod > max_u_b) \
4866  break; \
4867  \
4868  MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4869  \
4870  k++; \
4871  } \
4872  } \
4873  } /* omp parallel */ \
4874  return; \
4875  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
4876 }
4877 
4878 static void nfft_adjoint_3d_B(X(plan) *ths)
4879 {
4880  INT k;
4881  const INT n0 = ths->n[0];
4882  const INT n1 = ths->n[1];
4883  const INT n2 = ths->n[2];
4884  const INT M = ths->M_total;
4885  const INT m = ths->m;
4886 
4887  C* g = (C*) ths->g;
4888 
4889  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
4890 
4891  if(ths->flags & PRE_FULL_PSI)
4892  {
4893  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
4894  (INT)3, ths->n, m, ths->flags, ths->index_x);
4895  return;
4896  } /* if(PRE_FULL_PSI) */
4897 
4898  if(ths->flags & PRE_PSI)
4899  {
4900 #ifdef _OPENMP
4901  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
4902 #endif
4903 
4904 #ifdef _OPENMP
4905  #pragma omp parallel for default(shared) private(k)
4906 #endif
4907  for (k = 0; k < M; k++)
4908  {
4909  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4910 #ifdef _OPENMP
4911  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4912 #else
4913  nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4914 #endif
4915  }
4916  return;
4917  } /* if(PRE_PSI) */
4918 
4919  if(ths->flags & PRE_FG_PSI)
4920  {
4921  R fg_exp_l[3*(2*m+2)];
4922 
4923  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4924  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4925  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4926 
4927 #ifdef _OPENMP
4928  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
4929 #endif
4930 
4931 #ifdef _OPENMP
4932  #pragma omp parallel for default(shared) private(k)
4933 #endif
4934  for (k = 0; k < M; k++)
4935  {
4936  R psij_const[3*(2*m+2)];
4937  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4938  INT l;
4939  R fg_psij0 = ths->psi[2*j*3];
4940  R fg_psij1 = ths->psi[2*j*3+1];
4941  R fg_psij2 = K(1.0);
4942 
4943  psij_const[0] = fg_psij0;
4944  for(l=1; l<=2*m+1; l++)
4945  {
4946  fg_psij2 *= fg_psij1;
4947  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4948  }
4949 
4950  fg_psij0 = ths->psi[2*(j*3+1)];
4951  fg_psij1 = ths->psi[2*(j*3+1)+1];
4952  fg_psij2 = K(1.0);
4953  psij_const[2*m+2] = fg_psij0;
4954  for(l=1; l<=2*m+1; l++)
4955  {
4956  fg_psij2 *= fg_psij1;
4957  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4958  }
4959 
4960  fg_psij0 = ths->psi[2*(j*3+2)];
4961  fg_psij1 = ths->psi[2*(j*3+2)+1];
4962  fg_psij2 = K(1.0);
4963  psij_const[2*(2*m+2)] = fg_psij0;
4964  for(l=1; l<=2*m+1; l++)
4965  {
4966  fg_psij2 *= fg_psij1;
4967  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4968  }
4969 
4970 #ifdef _OPENMP
4971  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4972 #else
4973  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4974 #endif
4975  }
4976 
4977  return;
4978  } /* if(PRE_FG_PSI) */
4979 
4980  if(ths->flags & FG_PSI)
4981  {
4982  R fg_exp_l[3*(2*m+2)];
4983 
4984  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4985  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4986  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4987 
4988  sort(ths);
4989 
4990 #ifdef _OPENMP
4991  MACRO_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
4992 #endif
4993 
4994 #ifdef _OPENMP
4995  #pragma omp parallel for default(shared) private(k)
4996 #endif
4997  for (k = 0; k < M; k++)
4998  {
4999  INT u,o,l;
5000  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5001  R psij_const[3*(2*m+2)];
5002  R fg_psij0, fg_psij1, fg_psij2;
5003 
5004  uo(ths,j,&u,&o,(INT)0);
5005  fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
5006  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
5007  fg_psij2 = K(1.0);
5008  psij_const[0] = fg_psij0;
5009  for(l=1; l<=2*m+1; l++)
5010  {
5011  fg_psij2 *= fg_psij1;
5012  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
5013  }
5014 
5015  uo(ths,j,&u,&o,(INT)1);
5016  fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
5017  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
5018  fg_psij2 = K(1.0);
5019  psij_const[2*m+2] = fg_psij0;
5020  for(l=1; l<=2*m+1; l++)
5021  {
5022  fg_psij2 *= fg_psij1;
5023  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5024  }
5025 
5026  uo(ths,j,&u,&o,(INT)2);
5027  fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
5028  fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
5029  fg_psij2 = K(1.0);
5030  psij_const[2*(2*m+2)] = fg_psij0;
5031  for(l=1; l<=2*m+1; l++)
5032  {
5033  fg_psij2 *= fg_psij1;
5034  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5035  }
5036 
5037 #ifdef _OPENMP
5038  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5039 #else
5040  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5041 #endif
5042  }
5043 
5044  return;
5045  } /* if(FG_PSI) */
5046 
5047  if(ths->flags & PRE_LIN_PSI)
5048  {
5049  const INT K = ths->K;
5050  const INT ip_s = K / (m + 2);
5051 
5052  sort(ths);
5053 
5054 #ifdef _OPENMP
5055  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
5056 #endif
5057 
5058 #ifdef _OPENMP
5059  #pragma omp parallel for default(shared) private(k)
5060 #endif
5061  for (k = 0; k < M; k++)
5062  {
5063  INT u,o,l;
5064  INT ip_u;
5065  R ip_y, ip_w;
5066  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5067  R psij_const[3*(2*m+2)];
5068 
5069  uo(ths,j,&u,&o,(INT)0);
5070  ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
5071  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5072  ip_w = ip_y - (R)(ip_u);
5073  for(l=0; l < 2*m+2; l++)
5074  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5075  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5076 
5077  uo(ths,j,&u,&o,(INT)1);
5078  ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
5079  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5080  ip_w = ip_y - (R)(ip_u);
5081  for(l=0; l < 2*m+2; l++)
5082  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5083  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5084 
5085  uo(ths,j,&u,&o,(INT)2);
5086  ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
5087  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5088  ip_w = ip_y - (R)(ip_u);
5089  for(l=0; l < 2*m+2; l++)
5090  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5091  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5092 
5093 #ifdef _OPENMP
5094  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5095 #else
5096  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5097 #endif
5098  }
5099  return;
5100  } /* if(PRE_LIN_PSI) */
5101 
5102  /* no precomputed psi at all */
5103  sort(ths);
5104 
5105 #ifdef _OPENMP
5106  MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5107 #endif
5108 
5109 #ifdef _OPENMP
5110  #pragma omp parallel for default(shared) private(k)
5111 #endif
5112  for (k = 0; k < M; k++)
5113  {
5114  INT u,o,l;
5115  R psij_const[3*(2*m+2)];
5116  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5117 
5118  uo(ths,j,&u,&o,(INT)0);
5119  for(l=0;l<=2*m+1;l++)
5120  psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
5121 
5122  uo(ths,j,&u,&o,(INT)1);
5123  for(l=0;l<=2*m+1;l++)
5124  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
5125 
5126  uo(ths,j,&u,&o,(INT)2);
5127  for(l=0;l<=2*m+1;l++)
5128  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
5129 
5130 #ifdef _OPENMP
5131  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5132 #else
5133  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5134 #endif
5135  }
5136 }
5137 
5138 
5139 void X(trafo_3d)(X(plan) *ths)
5140 {
5141  if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5142  {
5143  X(trafo_direct)(ths);
5144  return;
5145  }
5146 
5147  INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5148  C *g_hat,*f_hat;
5149  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5150  R ck01, ck02, ck11, ck12, ck21, ck22;
5151  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5152  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5153 
5154  ths->g_hat=ths->g1;
5155  ths->g=ths->g2;
5156 
5157  N0=ths->N[0];
5158  N1=ths->N[1];
5159  N2=ths->N[2];
5160  n0=ths->n[0];
5161  n1=ths->n[1];
5162  n2=ths->n[2];
5163 
5164  f_hat=(C*)ths->f_hat;
5165  g_hat=(C*)ths->g_hat;
5166 
5167  TIC(0)
5168 #ifdef _OPENMP
5169  #pragma omp parallel for default(shared) private(k0)
5170  for (k0 = 0; k0 < ths->n_total; k0++)
5171  ths->g_hat[k0] = 0.0;
5172 #else
5173  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
5174 #endif
5175 
5176  if(ths->flags & PRE_PHI_HUT)
5177  {
5178  c_phi_inv01=ths->c_phi_inv[0];
5179  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5180 
5181 #ifdef _OPENMP
5182  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5183 #endif
5184  for(k0=0;k0<N0/2;k0++)
5185  {
5186  ck01=c_phi_inv01[k0];
5187  ck02=c_phi_inv02[k0];
5188  c_phi_inv11=ths->c_phi_inv[1];
5189  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5190 
5191  for(k1=0;k1<N1/2;k1++)
5192  {
5193  ck11=c_phi_inv11[k1];
5194  ck12=c_phi_inv12[k1];
5195  c_phi_inv21=ths->c_phi_inv[2];
5196  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5197 
5198  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5199  f_hat111=f_hat + (k0*N1+k1)*N2;
5200  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5201  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5202  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5203  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5204  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5205  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5206 
5207  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5208  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5209  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5210  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5211  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5212  f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5213  g_hat222=g_hat + (k0*n1+k1)*n2;
5214  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5215 
5216  for(k2=0;k2<N2/2;k2++)
5217  {
5218  ck21=c_phi_inv21[k2];
5219  ck22=c_phi_inv22[k2];
5220 
5221  g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5222  g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5223  g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5224  g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5225 
5226  g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5227  g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5228  g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5229  g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5230  }
5231  }
5232  }
5233  }
5234  else
5235 #ifdef _OPENMP
5236  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5237 #endif
5238  for(k0=0;k0<N0/2;k0++)
5239  {
5240  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5241  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5242  for(k1=0;k1<N1/2;k1++)
5243  {
5244  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5245  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5246 
5247  for(k2=0;k2<N2/2;k2++)
5248  {
5249  ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5250  ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5251 
5252  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5253  g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5254  g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5255  g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5256 
5257  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5258  g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5259  g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5260  g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5261  }
5262  }
5263  }
5264 
5265  TOC(0)
5266 
5267  TIC_FFTW(1)
5268  FFTW(execute)(ths->my_fftw_plan1);
5269  TOC_FFTW(1);
5270 
5271  TIC(2);
5272  nfft_trafo_3d_B(ths);
5273  TOC(2);
5274 }
5275 
5276 void X(adjoint_3d)(X(plan) *ths)
5277 {
5278  if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5279  {
5280  X(adjoint_direct)(ths);
5281  return;
5282  }
5283 
5284  INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5285  C *g_hat,*f_hat;
5286  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5287  R ck01, ck02, ck11, ck12, ck21, ck22;
5288  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5289  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5290 
5291  ths->g_hat=ths->g1;
5292  ths->g=ths->g2;
5293 
5294  N0=ths->N[0];
5295  N1=ths->N[1];
5296  N2=ths->N[2];
5297  n0=ths->n[0];
5298  n1=ths->n[1];
5299  n2=ths->n[2];
5300 
5301  f_hat=(C*)ths->f_hat;
5302  g_hat=(C*)ths->g_hat;
5303 
5304  TIC(2);
5305  nfft_adjoint_3d_B(ths);
5306  TOC(2);
5307 
5308  TIC_FFTW(1)
5309  FFTW(execute)(ths->my_fftw_plan2);
5310  TOC_FFTW(1);
5311 
5312  TIC(0)
5313  if(ths->flags & PRE_PHI_HUT)
5314  {
5315  c_phi_inv01=ths->c_phi_inv[0];
5316  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5317 
5318 #ifdef _OPENMP
5319  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5320 #endif
5321  for(k0=0;k0<N0/2;k0++)
5322  {
5323  ck01=c_phi_inv01[k0];
5324  ck02=c_phi_inv02[k0];
5325  c_phi_inv11=ths->c_phi_inv[1];
5326  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5327 
5328  for(k1=0;k1<N1/2;k1++)
5329  {
5330  ck11=c_phi_inv11[k1];
5331  ck12=c_phi_inv12[k1];
5332  c_phi_inv21=ths->c_phi_inv[2];
5333  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5334 
5335  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5336  f_hat111=f_hat + (k0*N1+k1)*N2;
5337  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5338  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5339  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5340  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5341  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5342  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5343 
5344  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5345  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5346  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5347  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5348  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5349  f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5350  g_hat222=g_hat + (k0*n1+k1)*n2;
5351  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5352 
5353  for(k2=0;k2<N2/2;k2++)
5354  {
5355  ck21=c_phi_inv21[k2];
5356  ck22=c_phi_inv22[k2];
5357 
5358  f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5359  f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5360  f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5361  f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5362 
5363  f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5364  f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5365  f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5366  f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5367  }
5368  }
5369  }
5370  }
5371  else
5372 #ifdef _OPENMP
5373  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5374 #endif
5375  for(k0=0;k0<N0/2;k0++)
5376  {
5377  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5378  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5379  for(k1=0;k1<N1/2;k1++)
5380  {
5381  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5382  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5383 
5384  for(k2=0;k2<N2/2;k2++)
5385  {
5386  ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5387  ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5388 
5389  f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5390  f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5391  f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5392  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5393 
5394  f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5395  f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5396  f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5397  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5398  }
5399  }
5400  }
5401 
5402  TOC(0)
5403 }
5404 
5407 void X(trafo)(X(plan) *ths)
5408 {
5409  /* use direct transform if degree N is too low */
5410  for (int j = 0; j < ths->d; j++)
5411  {
5412  if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5413  {
5414  X(trafo_direct)(ths);
5415  return;
5416  }
5417  }
5418 
5419  switch(ths->d)
5420  {
5421  case 1: X(trafo_1d)(ths); break;
5422  case 2: X(trafo_2d)(ths); break;
5423  case 3: X(trafo_3d)(ths); break;
5424  default:
5425  {
5426  /* use ths->my_fftw_plan1 */
5427  ths->g_hat = ths->g1;
5428  ths->g = ths->g2;
5429 
5433  TIC(0)
5434  D_A(ths);
5435  TOC(0)
5436 
5441  TIC_FFTW(1)
5442  FFTW(execute)(ths->my_fftw_plan1);
5443  TOC_FFTW(1)
5444 
5448  TIC(2)
5449  B_A(ths);
5450  TOC(2)
5451  }
5452  }
5453 } /* nfft_trafo */
5454 
5455 void X(adjoint)(X(plan) *ths)
5456 {
5457  /* use direct transform if degree N is too low */
5458  for (int j = 0; j < ths->d; j++)
5459  {
5460  if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5461  {
5462  X(adjoint_direct)(ths);
5463  return;
5464  }
5465  }
5466 
5467  switch(ths->d)
5468  {
5469  case 1: X(adjoint_1d)(ths); break;
5470  case 2: X(adjoint_2d)(ths); break;
5471  case 3: X(adjoint_3d)(ths); break;
5472  default:
5473  {
5474  /* use ths->my_fftw_plan2 */
5475  ths->g_hat=ths->g1;
5476  ths->g=ths->g2;
5477 
5481  TIC(2)
5482  B_T(ths);
5483  TOC(2)
5484 
5489  TIC_FFTW(1)
5490  FFTW(execute)(ths->my_fftw_plan2);
5491  TOC_FFTW(1)
5492 
5496  TIC(0)
5497  D_T(ths);
5498  TOC(0)
5499  }
5500  }
5501 } /* nfft_adjoint */
5502 
5503 
5506 static void precompute_phi_hut(X(plan) *ths)
5507 {
5508  INT ks[ths->d]; /* index over all frequencies */
5509  INT t; /* index over all dimensions */
5510 
5511  ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) * sizeof(R*));
5512 
5513  for (t = 0; t < ths->d; t++)
5514  {
5515  ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) * sizeof(R));
5516 
5517  for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
5518  {
5519  ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
5520  }
5521  }
5522 } /* nfft_phi_hut */
5523 
5528 void X(precompute_lin_psi)(X(plan) *ths)
5529 {
5530  INT t;
5531  INT j;
5532  R step;
5534  for (t=0; t<ths->d; t++)
5535  {
5536  step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
5537  for(j = 0;j <= ths->K; j++)
5538  {
5539  ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
5540  } /* for(j) */
5541  } /* for(t) */
5542 }
5543 
5544 void X(precompute_fg_psi)(X(plan) *ths)
5545 {
5546  INT t;
5547  INT u, o;
5549  sort(ths);
5550 
5551  for (t=0; t<ths->d; t++)
5552  {
5553  INT j;
5554 #ifdef _OPENMP
5555  #pragma omp parallel for default(shared) private(j,u,o)
5556 #endif
5557  for (j = 0; j < ths->M_total; j++)
5558  {
5559  uo(ths,j,&u,&o,t);
5560 
5561  ths->psi[2*(j*ths->d+t)]=
5562  (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
5563 
5564  ths->psi[2*(j*ths->d+t)+1]=
5565  EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
5566  } /* for(j) */
5567  }
5568  /* for(t) */
5569 } /* nfft_precompute_fg_psi */
5570 
5571 void X(precompute_psi)(X(plan) *ths)
5572 {
5573  INT t; /* index over all dimensions */
5574  INT l; /* index u<=l<=o */
5575  INT lj; /* index 0<=lj<u+o+1 */
5576  INT u, o; /* depends on x_j */
5577 
5578  sort(ths);
5579 
5580  for (t=0; t<ths->d; t++)
5581  {
5582  INT j;
5583 #ifdef _OPENMP
5584  #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5585 #endif
5586  for (j = 0; j < ths->M_total; j++)
5587  {
5588  uo(ths,j,&u,&o,t);
5589 
5590  for(l = u, lj = 0; l <= o; l++, lj++)
5591  ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
5592  (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
5593  } /* for(j) */
5594  }
5595  /* for(t) */
5596 } /* nfft_precompute_psi */
5597 
5598 #ifdef _OPENMP
5599 static void nfft_precompute_full_psi_omp(X(plan) *ths)
5600 {
5601  INT j;
5602  INT lprod;
5604  {
5605  INT t;
5606  for(t=0,lprod = 1; t<ths->d; t++)
5607  lprod *= 2*ths->m+2;
5608  }
5609 
5610  #pragma omp parallel for default(shared) private(j)
5611  for(j=0; j<ths->M_total; j++)
5612  {
5613  INT t,t2;
5614  INT l_L;
5615  INT l[ths->d];
5616  INT lj[ths->d];
5617  INT ll_plain[ths->d+1];
5619  INT u[ths->d], o[ths->d];
5621  R phi_prod[ths->d+1];
5622  INT ix = j*lprod;
5623 
5624  phi_prod[0]=1;
5625  ll_plain[0]=0;
5626 
5627  MACRO_init_uo_l_lj_t;
5628 
5629  for(l_L=0; l_L<lprod; l_L++, ix++)
5630  {
5631  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5632 
5633  ths->psi_index_g[ix]=ll_plain[ths->d];
5634  ths->psi[ix]=phi_prod[ths->d];
5635 
5636  MACRO_count_uo_l_lj_t;
5637  } /* for(l_L) */
5638 
5639  ths->psi_index_f[j]=lprod;
5640  } /* for(j) */
5641 }
5642 #endif
5643 
5644 void X(precompute_full_psi)(X(plan) *ths)
5645 {
5646 #ifdef _OPENMP
5647  sort(ths);
5648 
5649  nfft_precompute_full_psi_omp(ths);
5650 #else
5651  INT t, t2; /* index over all dimensions */
5652  INT j; /* index over all nodes */
5653  INT l_L; /* plain index 0 <= l_L < lprod */
5654  INT l[ths->d]; /* multi index u<=l<=o */
5655  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
5656  INT ll_plain[ths->d+1]; /* postfix plain index */
5657  INT lprod; /* 'bandwidth' of matrix B */
5658  INT u[ths->d], o[ths->d]; /* depends on x_j */
5659 
5660  R phi_prod[ths->d+1];
5661 
5662  INT ix, ix_old;
5663 
5664  sort(ths);
5665 
5666  phi_prod[0] = K(1.0);
5667  ll_plain[0] = 0;
5668 
5669  for (t = 0, lprod = 1; t < ths->d; t++)
5670  lprod *= 2 * ths->m + 2;
5671 
5672  for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
5673  {
5674  MACRO_init_uo_l_lj_t;
5675 
5676  for (l_L = 0; l_L < lprod; l_L++, ix++)
5677  {
5678  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5679 
5680  ths->psi_index_g[ix] = ll_plain[ths->d];
5681  ths->psi[ix] = phi_prod[ths->d];
5682 
5683  MACRO_count_uo_l_lj_t;
5684  } /* for(l_L) */
5685 
5686  ths->psi_index_f[j] = ix - ix_old;
5687  ix_old = ix;
5688  } /* for(j) */
5689 #endif
5690 }
5691 
5692 void X(precompute_one_psi)(X(plan) *ths)
5693 {
5694  if(ths->flags & PRE_LIN_PSI)
5695  X(precompute_lin_psi)(ths);
5696  if(ths->flags & PRE_FG_PSI)
5697  X(precompute_fg_psi)(ths);
5698  if(ths->flags & PRE_PSI)
5699  X(precompute_psi)(ths);
5700  if(ths->flags & PRE_FULL_PSI)
5701  X(precompute_full_psi)(ths);
5702 }
5703 
5704 static void init_help(X(plan) *ths)
5705 {
5706  INT t; /* index over all dimensions */
5707  INT lprod; /* 'bandwidth' of matrix B */
5708 
5709  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5710  ths->flags |= NFFT_SORT_NODES;
5711 
5712  ths->N_total = intprod(ths->N, 0, ths->d);
5713  ths->n_total = intprod(ths->n, 0, ths->d);
5714 
5715  ths->sigma = (R*) Y(malloc)((size_t)(ths->d) * sizeof(R));
5716 
5717  for(t = 0;t < ths->d; t++)
5718  ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
5719 
5720  WINDOW_HELP_INIT;
5721 
5722  if(ths->flags & MALLOC_X)
5723  ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) * sizeof(R));
5724 
5725  if(ths->flags & MALLOC_F_HAT)
5726  ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) * sizeof(C));
5727 
5728  if(ths->flags & MALLOC_F)
5729  ths->f = (C*)Y(malloc)((size_t)(ths->M_total) * sizeof(C));
5730 
5731  if(ths->flags & PRE_PHI_HUT)
5732  precompute_phi_hut(ths);
5733 
5734  if (ths->flags & PRE_LIN_PSI)
5735  {
5736  if (ths->K == 0)
5737  {
5738  ths->K = Y(m2K)(ths->m);
5739  }
5740  ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) * sizeof(R));
5741  }
5742 
5743  if(ths->flags & PRE_FG_PSI)
5744  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) * sizeof(R));
5745 
5746  if(ths->flags & PRE_PSI)
5747  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) * sizeof(R));
5748 
5749  if(ths->flags & PRE_FULL_PSI)
5750  {
5751  for (t = 0, lprod = 1; t < ths->d; t++)
5752  lprod *= 2 * ths->m + 2;
5753 
5754  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(R));
5755 
5756  ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) * sizeof(INT));
5757  ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(INT));
5758  }
5759 
5760  if(ths->flags & FFTW_INIT)
5761  {
5762 #ifdef _OPENMP
5763  INT nthreads = Y(get_num_threads)();
5764 #endif
5765 
5766  ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) * sizeof(C));
5767 
5768  if(ths->flags & FFT_OUT_OF_PLACE)
5769  ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) * sizeof(C));
5770  else
5771  ths->g2 = ths->g1;
5772 
5773 #ifdef _OPENMP
5774 #pragma omp critical (nfft_omp_critical_fftw_plan)
5775 {
5776  FFTW(plan_with_nthreads)(nthreads);
5777 #endif
5778  {
5779  int *_n = Y(malloc)((size_t)(ths->d) * sizeof(int));
5780 
5781  for (t = 0; t < ths->d; t++)
5782  _n[t] = (int)(ths->n[t]);
5783 
5784  ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
5785  ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
5786  Y(free)(_n);
5787  }
5788 #ifdef _OPENMP
5789 }
5790 #endif
5791  }
5792 
5793  if(ths->flags & NFFT_SORT_NODES)
5794  ths->index_x = (INT*) Y(malloc)(sizeof(INT) * 2U * (size_t)(ths->M_total));
5795  else
5796  ths->index_x = NULL;
5797 
5798  ths->mv_trafo = (void (*) (void* ))X(trafo);
5799  ths->mv_adjoint = (void (*) (void* ))X(adjoint);
5800 }
5801 
5802 void X(init)(X(plan) *ths, int d, int *N, int M_total)
5803 {
5804  INT t; /* index over all dimensions */
5805 
5806  ths->d = (INT)d;
5807 
5808  ths->N = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
5809 
5810  for (t = 0; t < d; t++)
5811  ths->N[t] = (INT)N[t];
5812 
5813  ths->M_total = (INT)M_total;
5814 
5815  ths->n = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
5816 
5817  for (t = 0; t < d; t++)
5818  ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
5819 
5820  ths->m = WINDOW_HELP_ESTIMATE_m;
5821 
5822  if (d > 1)
5823  {
5824 #ifdef _OPENMP
5825  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5826  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
5827  NFFT_OMP_BLOCKWISE_ADJOINT;
5828 #else
5829  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5830  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
5831 #endif
5832  }
5833  else
5834  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5836 
5837  ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
5838 
5839  ths->K = 0;
5840  init_help(ths);
5841 }
5842 
5843 void X(init_guru)(X(plan) *ths, int d, int *N, int M_total, int *n, int m,
5844  unsigned flags, unsigned fftw_flags)
5845 {
5846  INT t; /* index over all dimensions */
5847 
5848  ths->d = (INT)d;
5849  ths->M_total = (INT)M_total;
5850  ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5851 
5852  for (t = 0; t < d; t++)
5853  ths->N[t] = (INT)N[t];
5854 
5855  ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5856 
5857  for (t = 0; t < d; t++)
5858  ths->n[t] = (INT)n[t];
5859 
5860  ths->m = (INT)m;
5861 
5862  ths->flags = flags;
5863  ths->fftw_flags = fftw_flags;
5864 
5865  ths->K = 0;
5866  init_help(ths);
5867 }
5868 
5869 void X(init_lin)(X(plan) *ths, int d, int *N, int M_total, int *n, int m, int K,
5870  unsigned flags, unsigned fftw_flags)
5871 {
5872  INT t; /* index over all dimensions */
5873 
5874  ths->d = (INT)d;
5875  ths->M_total = (INT)M_total;
5876  ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5877 
5878  for (t = 0; t < d; t++)
5879  ths->N[t] = (INT)N[t];
5880 
5881  ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5882 
5883  for (t = 0; t < d; t++)
5884  ths->n[t] = (INT)n[t];
5885 
5886  ths->m = (INT)m;
5887 
5888  ths->flags = flags;
5889  ths->fftw_flags = fftw_flags;
5890 
5891  ths->K = K;
5892  init_help(ths);
5893 }
5894 
5895 void X(init_1d)(X(plan) *ths, int N1, int M_total)
5896 {
5897  int N[1];
5898 
5899  N[0] = N1;
5900 
5901  X(init)(ths, 1, N, M_total);
5902 }
5903 
5904 void X(init_2d)(X(plan) *ths, int N1, int N2, int M_total)
5905 {
5906  int N[2];
5907 
5908  N[0] = N1;
5909  N[1] = N2;
5910  X(init)(ths, 2, N, M_total);
5911 }
5912 
5913 void X(init_3d)(X(plan) *ths, int N1, int N2, int N3, int M_total)
5914 {
5915  int N[3];
5916 
5917  N[0] = N1;
5918  N[1] = N2;
5919  N[2] = N3;
5920  X(init)(ths, 3, N, M_total);
5921 }
5922 
5923 const char* X(check)(X(plan) *ths)
5924 {
5925  INT j;
5926 
5927  if (!ths->f)
5928  return "Member f not initialized.";
5929 
5930  if (!ths->x)
5931  return "Member x not initialized.";
5932 
5933  if (!ths->f_hat)
5934  return "Member f_hat not initialized.";
5935 
5936  if ((ths->flags & PRE_LIN_PSI) && ths->K < ths->M_total)
5937  return "Number of nodes too small to use PRE_LIN_PSI.";
5938 
5939  for (j = 0; j < ths->M_total * ths->d; j++)
5940  {
5941  if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
5942  {
5943  return "ths->x out of range [-0.5,0.5)";
5944  }
5945  }
5946 
5947  for (j = 0; j < ths->d; j++)
5948  {
5949  if (ths->sigma[j] <= 1)
5950  return "Oversampling factor too small";
5951 
5952  /* Automatically calls trafo_direct if
5953  if(ths->N[j] <= ths->m)
5954  return "Polynomial degree N is <= cut-off m";
5955  */
5956 
5957  if(ths->N[j]%2 == 1)
5958  return "polynomial degree N has to be even";
5959  }
5960  return 0;
5961 }
5962 
5963 void X(finalize)(X(plan) *ths)
5964 {
5965  INT t; /* index over dimensions */
5966 
5967  if(ths->flags & NFFT_SORT_NODES)
5968  Y(free)(ths->index_x);
5969 
5970  if(ths->flags & FFTW_INIT)
5971  {
5972 #ifdef _OPENMP
5973  #pragma omp critical (nfft_omp_critical_fftw_plan)
5974 #endif
5975  FFTW(destroy_plan)(ths->my_fftw_plan2);
5976 #ifdef _OPENMP
5977  #pragma omp critical (nfft_omp_critical_fftw_plan)
5978 #endif
5979  FFTW(destroy_plan)(ths->my_fftw_plan1);
5980 
5981  if(ths->flags & FFT_OUT_OF_PLACE)
5982  Y(free)(ths->g2);
5983 
5984  Y(free)(ths->g1);
5985  }
5986 
5987  if(ths->flags & PRE_FULL_PSI)
5988  {
5989  Y(free)(ths->psi_index_g);
5990  Y(free)(ths->psi_index_f);
5991  Y(free)(ths->psi);
5992  }
5993 
5994  if(ths->flags & PRE_PSI)
5995  Y(free)(ths->psi);
5996 
5997  if(ths->flags & PRE_FG_PSI)
5998  Y(free)(ths->psi);
5999 
6000  if(ths->flags & PRE_LIN_PSI)
6001  Y(free)(ths->psi);
6002 
6003  if(ths->flags & PRE_PHI_HUT)
6004  {
6005  for (t = 0; t < ths->d; t++)
6006  Y(free)(ths->c_phi_inv[t]);
6007  Y(free)(ths->c_phi_inv);
6008  }
6009 
6010  if(ths->flags & MALLOC_F)
6011  Y(free)(ths->f);
6012 
6013  if(ths->flags & MALLOC_F_HAT)
6014  Y(free)(ths->f_hat);
6015 
6016  if(ths->flags & MALLOC_X)
6017  Y(free)(ths->x);
6018 
6019  WINDOW_HELP_FINALIZE;
6020 
6021  Y(free)(ths->sigma);
6022  Y(free)(ths->n);
6023  Y(free)(ths->N);
6024 }
#define TIC(a)
Timing, method works since the inaccurate timer is updated mostly in the measured function...
Definition: infft.h:1397
#define PRE_FG_PSI
Definition: nfft3.h:196
#define MALLOC_X
Definition: nfft3.h:199
#define MALLOC_F_HAT
Definition: nfft3.h:200
#define FG_PSI
Definition: nfft3.h:194
#define FFTW_INIT
Definition: nfft3.h:203
#define MALLOC_F
Definition: nfft3.h:201
#define X(name)
Include header for C99 complex datatype.
Definition: fastsum.h:57
#define FFT_OUT_OF_PLACE
Definition: nfft3.h:202
#define PRE_LIN_PSI
Definition: nfft3.h:195
#define PRE_PSI
Definition: nfft3.h:197
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.
Definition: infft.h:1365
#define PRE_FULL_PSI
Definition: nfft3.h:198
Header file for the nfft3 library.
#define PRE_PHI_HUT
Definition: nfft3.h:193