|
/*
 * Set the first n elements of x to the constant c.
 *
 * Written in portable C (no inline assembly), so it builds on any target.
 *
 * x: destination buffer; at least n floats are written when n > 0.
 * c: value stored into every element.
 * n: number of elements to write; nothing is written when n <= 0.
 */
static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) {
  /* Same type as n, so the index cannot overflow on targets where long is
   * narrower than ptrdiff_t. */
  ptrdiff_t i = 0;

  /* Main loop, unrolled by four. `n - i >= 4` lets an exact multiple of four
   * finish here instead of falling through to the scalar tail. */
  for (; n - i >= 4; i += 4) {
    x[i] = c;
    x[i + 1] = c;
    x[i + 2] = c;
    x[i + 3] = c;
  }

  /* Tail: the remaining 0..3 elements. */
  for (; i < n; i++) {
    x[i] = c;
  }
}
30 | 16 |
|
31 | 17 |
|
/*
 * Element-wise difference: z[i] = x[i] - y[i] for i in [0, n).
 *
 * Written in portable C (no inline assembly). Elements are processed in
 * ascending index order.
 *
 * z: destination buffer; at least n floats are written when n > 0.
 * x: minuend values (read only).
 * y: subtrahend values (read only).
 * n: number of elements to process; nothing is done when n <= 0.
 */
static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
  /* Same type as n, so the index cannot overflow on targets where long is
   * narrower than ptrdiff_t. */
  ptrdiff_t i = 0;

  /* Main loop, unrolled by four. `n - i >= 4` lets an exact multiple of four
   * finish here instead of falling through to the scalar tail. */
  for (; n - i >= 4; i += 4) {
    z[i] = x[i] - y[i];
    z[i + 1] = x[i + 1] - y[i + 1];
    z[i + 2] = x[i + 2] - y[i + 2];
    z[i + 3] = x[i + 3] - y[i + 3];
  }

  /* Tail: the remaining 0..3 elements. */
  for (; i < n; i++) {
    z[i] = x[i] - y[i];
  }
}
71 | 33 |
|
72 | 34 |
|
/*
 * Scale a vector in place: y[i] *= c for i in [0, n).
 *
 * Written in portable C (no inline assembly).
 *
 * y: buffer that is both read and written; must hold at least n floats
 *    when n > 0.
 * c: scale factor applied to every element.
 * n: number of elements to process; nothing is done when n <= 0.
 */
static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) {
  /* Same type as n, so the index cannot overflow on targets where long is
   * narrower than ptrdiff_t. */
  ptrdiff_t i = 0;

  /* Main loop, unrolled by four. `n - i >= 4` lets an exact multiple of four
   * finish here instead of falling through to the scalar tail. */
  for (; n - i >= 4; i += 4) {
    y[i] *= c;
    y[i + 1] *= c;
    y[i + 2] *= c;
    y[i + 3] *= c;
  }

  /* Tail: the remaining 0..3 elements. */
  for (; i < n; i++) {
    y[i] *= c;
  }
}
152 | 49 |
|
/*
 * Element-wise in-place product: y[i] *= x[i] for i in [0, n).
 *
 * Written in portable C (no inline assembly).
 *
 * y: buffer that is both read and written; must hold at least n floats
 *    when n > 0.
 * x: multiplier values (read only).
 * n: number of elements to process; nothing is done when n <= 0.
 */
static void THFloatVector_mul_NEON(float *y, const float *x, const ptrdiff_t n) {
  /* Same type as n, so the index cannot overflow on targets where long is
   * narrower than ptrdiff_t. */
  ptrdiff_t i = 0;

  /* Main loop, unrolled by four. `n - i >= 4` lets an exact multiple of four
   * finish here instead of falling through to the scalar tail. */
  for (; n - i >= 4; i += 4) {
    y[i] *= x[i];
    y[i + 1] *= x[i + 1];
    y[i + 2] *= x[i + 2];
    y[i + 3] *= x[i + 3];
  }

  /* Tail: the remaining 0..3 elements. */
  for (; i < n; i++) {
    y[i] *= x[i];
  }
}
192 | 64 |
|
/*
 * Scaled accumulate: y[i] += c * x[i] for i in [0, n).
 *
 * Note that despite the name this is not a plain element-wise add; every
 * x element is multiplied by c before being added into y.
 *
 * Written in portable C (no inline assembly).
 *
 * y: accumulator buffer, both read and written; must hold at least n floats
 *    when n > 0.
 * x: values to scale and add (read only).
 * c: scale factor applied to each x element.
 * n: number of elements to process; nothing is done when n <= 0.
 */
static void THFloatVector_add_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
  /* Same type as n, so the index cannot overflow on targets where long is
   * narrower than ptrdiff_t. */
  ptrdiff_t i = 0;

  /* Main loop, unrolled by four. `n - i >= 4` lets an exact multiple of four
   * finish here instead of falling through to the scalar tail. */
  for (; n - i >= 4; i += 4) {
    y[i] += c * x[i];
    y[i + 1] += c * x[i + 1];
    y[i + 2] += c * x[i + 2];
    y[i + 3] += c * x[i + 3];
  }

  /* Tail: the remaining 0..3 elements. */
  for (; i < n; i++) {
    y[i] += c * x[i];
  }
}
0 commit comments