50 -0.000224352093802f, 0.0107500557815f, -0.0456795873917f,
51 0.11282500582f, -0.208067578452f, 0.28717837501f,
52 -0.254675294431f, 0.0481081835026f, 0.227861357867f,
53 -0.365411839137f, 0.280729061131f, -0.0935061787728f
56 0.00543499018201f, -0.0173890685681f, 0.0229166931429f,
57 0.00278413661237f, -0.104628958675f, 0.33619239719f,
58 -0.683033899655f, 0.954061589374f, -0.891273574569f,
59 0.525088317271f, -0.155131206606f, 0.00512245855404f
62 -0.00495335976478f, -0.017859491302f, -0.0413714373155f,
63 -0.0882148408885f, -0.17922965812f, -0.338261800753f,
64 -0.557688699732f, -0.735157736148f, -0.719057381172f,
65 -0.517871025209f, -0.280197469471f, -0.0852751354531f
68 0.0092579876872f, 0.0273493725543f, 0.0744756910287f,
69 0.178349677457f, 0.39601340223f, 0.829229533354f,
70 1.61298538328f, 2.79987398682f, 4.16396166128f,
71 5.29724826804f, 5.99598602388f, 6.3048492377f
116 const __m128 vx = _mm_set1_ps(
x);
117 __m128 sumR = _mm_setzero_ps(), sumI = _mm_setzero_ps();
120 __m128 vr = _mm_load_ps(&sR[
i]);
121 __m128 vi = _mm_load_ps(&sI[
i]);
122 __m128 vpr = _mm_load_ps(&
polesR[
i]);
123 __m128 vpi = _mm_load_ps(&
polesI[
i]);
124 __m128 vcr = _mm_load_ps(&
coeffsR[
i]);
125 __m128 vci = _mm_load_ps(&
coeffsI[
i]);
127 __m128 rpr = _mm_mul_ps(vr, vpr);
128 __m128 impi = _mm_mul_ps(vi, vpi);
129 __m128 xcr = _mm_mul_ps(vx, vcr);
130 __m128 nr = _mm_add_ps(_mm_sub_ps(rpr, impi), xcr);
132 __m128 rpi = _mm_mul_ps(vr, vpi);
133 __m128 impr = _mm_mul_ps(vi, vpr);
134 __m128 xci = _mm_mul_ps(vx, vci);
135 __m128 ni = _mm_add_ps(_mm_add_ps(rpi, impr), xci);
137 _mm_store_ps(&sR[
i], nr);
138 _mm_store_ps(&sI[
i], ni);
140 sumR = _mm_add_ps(sumR, nr);
141 sumI = _mm_add_ps(sumI, ni);
143 float tmpR[4], tmpI[4];
144 _mm_storeu_ps(tmpR, sumR);
145 _mm_storeu_ps(tmpI, sumI);
146 out[0] =
x *
direct + (tmpR[0] + tmpR[1] + tmpR[2] + tmpR[3]);
147 out[1] = (tmpI[0] + tmpI[1] + tmpI[2] + tmpI[3]);
149 float sumR = 0.0f, sumI = 0.0f;
155 sR[
i] = nr; sI[
i] = ni;
156 sumR += nr; sumI += ni;
165 const float xr =
x[0], xi =
x[1];
168 const __m128 vxr = _mm_set1_ps(xr), vxi = _mm_set1_ps(xi);
169 __m128 sumR = _mm_setzero_ps(), sumI = _mm_setzero_ps();
172 __m128 vr = _mm_load_ps(&sR[
i]);
173 __m128 vi = _mm_load_ps(&sI[
i]);
174 __m128 vpr = _mm_load_ps(&
polesR[
i]);
175 __m128 vpi = _mm_load_ps(&
polesI[
i]);
176 __m128 vcr = _mm_load_ps(&
coeffsR[
i]);
177 __m128 vci = _mm_load_ps(&
coeffsI[
i]);
179 __m128 xrcr = _mm_mul_ps(vxr, vcr);
180 __m128 xici = _mm_mul_ps(vxi, vci);
181 __m128 xrci = _mm_mul_ps(vxr, vci);
182 __m128 xicr = _mm_mul_ps(vxi, vcr);
184 __m128 rpr = _mm_mul_ps(vr, vpr);
185 __m128 impi = _mm_mul_ps(vi, vpi);
186 __m128 rpi = _mm_mul_ps(vr, vpi);
187 __m128 impr = _mm_mul_ps(vi, vpr);
189 __m128 nr = _mm_add_ps(_mm_sub_ps(rpr, impi), _mm_sub_ps(xrcr, xici));
190 __m128 ni = _mm_add_ps(_mm_add_ps(rpi, impr), _mm_add_ps(xrci, xicr));
192 _mm_store_ps(&sR[
i], nr);
193 _mm_store_ps(&sI[
i], ni);
195 sumR = _mm_add_ps(sumR, nr);
196 sumI = _mm_add_ps(sumI, ni);
198 float tmpR[4], tmpI[4];
199 _mm_storeu_ps(tmpR, sumR);
200 _mm_storeu_ps(tmpI, sumI);
201 const float sr = tmpR[0] + tmpR[1] + tmpR[2] + tmpR[3];
202 const float si = tmpI[0] + tmpI[1] + tmpI[2] + tmpI[3];
206 float sumR = 0.0f, sumI = 0.0f;
209 const float r = sR[
i], im = sI[
i];
211 const float xrcr = xr * cr, xici = xi * ci, xrci = xr * ci, xicr = xi * cr;
212 const float nr =
r * pr - im *
pi + (xrcr - xici);
213 const float ni =
r *
pi + im * pr + (xrci + xicr);
214 sR[
i] = nr; sI[
i] = ni;
215 sumR += nr; sumI += ni;