Skip to content

Commit a8a02ff

Browse files
committed
Fix compilation for ASIMD
On ARMv8, NEON is inherent to the architecture and is instead listed as 'asimd' in /proc/cpuinfo. Replace assembly with C. Original authors: - @dusty-nv: FindARM-patch.txt, CMakeLists-patch.txt - @rtarquini: NEON.c
1 parent 1a3920e commit a8a02ff

File tree

3 files changed

+75
-237
lines changed

3 files changed

+75
-237
lines changed

CMakeLists.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,13 @@ ENDIF (WITH_OPENMP)
6262

6363
# ARM specific flags
6464
FIND_PACKAGE(ARM)
65-
IF (NEON_FOUND)
65+
IF (ASIMD_FOUND)
66+
MESSAGE(STATUS "asimd/Neon found with compiler flag : -D__NEON__")
67+
SET(CMAKE_C_FLAGS "-D__NEON__ ${CMAKE_C_FLAGS}")
68+
ELSEIF (NEON_FOUND)
6669
MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__")
6770
SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}")
68-
ENDIF (NEON_FOUND)
71+
ENDIF (ASIMD_FOUND)
6972
IF (CORTEXA8_FOUND)
7073
MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8")
7174
SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}")

cmake/FindARM.cmake

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,15 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
1313
set(NEON_FOUND false CACHE BOOL "NEON available on host")
1414
ENDIF (NEON_TRUE)
1515

16+
# On ARMv8, NEON is inherent to the architecture and is instead listed as 'asimd' in /proc/cpuinfo
17+
STRING(REGEX REPLACE "^.*(asimd).*$" "\\1" ASIMD_THERE ${CPUINFO})
18+
STRING(COMPARE EQUAL "asimd" "${ASIMD_THERE}" ASIMD_TRUE)
19+
IF (ASIMD_TRUE)
20+
set(ASIMD_FOUND true CACHE BOOL "ASIMD/NEON available on host")
21+
ELSE (ASIMD_TRUE)
22+
set(ASIMD_FOUND false CACHE BOOL "ASIMD/NEON available on host")
23+
ENDIF (ASIMD_TRUE)
24+
1625
#Find the processor type (for now OMAP3 or OMAP4)
1726
STRING(REGEX REPLACE "^.*(OMAP3).*$" "\\1" OMAP3_THERE ${CPUINFO})
1827
STRING(COMPARE EQUAL "OMAP3" "${OMAP3_THERE}" OMAP3_TRUE)

vector/NEON.c

Lines changed: 61 additions & 235 deletions
Original file line numberDiff line numberDiff line change
@@ -1,252 +1,78 @@
11
static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) {
2-
float ctemp = c;
3-
float * caddr = &ctemp;
4-
__asm__ __volatile__ (
5-
"mov r0, %0 @ \n\t"
6-
"ldr r4, [%1] @ \n\t"
7-
"vdup.32 q12, r4 @ \n\t"
8-
"vdup.32 q13, r4 @ \n\t"
9-
"lsrs r4, %2, #3 @ \n\t"
10-
"beq 3f @ \n\t"
11-
"1: @ \n\t"
12-
"vst1.32 {d24-d27}, [r0]! @ \n\t"
13-
"subs r4, r4, #1 @ \n\t"
14-
"bne 1b @ \n\t"
15-
"3: @ \n\t"
16-
"ands r4, %2, #7 @ \n\t"
17-
"beq 5f @ \n\t"
18-
"4: @ \n\t"
19-
"subs r4, r4, #1 @ \n\t"
20-
"vst1.32 {d24[0]}, [r0]! @ \n\t"
21-
"bne 4b @ \n\t"
22-
"5: @ "
23-
:
24-
:"r" (x), "r"(caddr),"r"(n)
25-
: "cc", "r0", "r4", "memory",
26-
"q12",
27-
"d24", "d25", "d26", "d27"
28-
);
2+
long i = 0;
3+
4+
for(; i < n-4; i += 4)
5+
{
6+
x[i] = c;
7+
x[i+1] = c;
8+
x[i+2] = c;
9+
x[i+3] = c;
10+
}
11+
12+
for(; i < n; i++)
13+
x[i] = c;
14+
2915
}
3016

3117

3218
static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
33-
__asm__ __volatile__ (
34-
"mov r0, %2 @ \n\t"
35-
"mov r1, %1 @ \n\t"
36-
"mov r2, %0 @ \n\t"
37-
"lsrs r4, %3, #3 @ \n\t"
38-
"beq 3f @ \n\t"
39-
"vld1.32 {d16-d19}, [r1]! @ \n\t"
40-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
41-
"1: @ \n\t"
42-
"vsub.f32 q12, q8, q0 @ \n\t"
43-
"vsub.f32 q13, q9, q1 @ \n\t"
44-
"subs r4, r4, #1 @ \n\t"
45-
"beq 2f @ \n\t"
46-
"vld1.32 {d16-d19}, [r1]! @ \n\t"
47-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
48-
"vst1.32 {d24-d27}, [r2]! @ \n\t"
49-
"b 1b @ \n\t"
50-
"2: @ \n\t"
51-
"vst1.32 {d24-d27}, [r2]! @ \n\t"
52-
"3: @ \n\t"
53-
"ands r4, %3, #7 @ \n\t"
54-
"beq 5f @ \n\t"
55-
"4: @ \n\t"
56-
"subs r4, r4, #1 @ \n\t"
57-
"vld1.32 {d16[0]}, [r1]! @ \n\t"
58-
"vld1.32 {d0[0]}, [r0]! @ \n\t"
59-
"vsub.f32 d24, d16, d0 @ \n\t"
60-
"vst1.32 {d24[0]}, [r2]! @ \n\t"
61-
"bne 4b @ \n\t"
62-
"5: @ "
63-
:
64-
:"r" (z), "r" (x),"r" (y), "r"(n)
65-
: "cc", "r0", "r1", "r2", "r4", "memory",
66-
"q0", "q1", "q8", "q9", "q12", "q13",
67-
"d0", "d1", "d2", "d3",
68-
"d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
69-
);
19+
long i = 0;
20+
21+
for(; i < n-4; i += 4)
22+
{
23+
z[i] = x[i] - y[i];
24+
z[i+1] = x[i+1] - y[i+1];
25+
z[i+2] = x[i+2] - y[i+2];
26+
z[i+3] = x[i+3] - y[i+3];
27+
}
28+
29+
for(; i < n; i++)
30+
z[i] = x[i] - y[i];
31+
7032
}
7133

7234

7335
static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) {
74-
float ctemp = c;
75-
float * caddr = &ctemp;
76-
__asm__ __volatile__ (
77-
"mov r0, %0 @ \n\t"
78-
"mov r2, r0 @ \n\t"
79-
"ldr r5, [%1] @ \n\t"
80-
"vdup.32 q14, r5 @ \n\t"
81-
"lsrs r5, %2, #5 @ \n\t"
82-
"beq 3f @ \n\t"
83-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
84-
"vld1.32 {d4-d7}, [r0]! @ \n\t"
85-
"vld1.32 {d8-d11}, [r0]! @ \n\t"
86-
"vld1.32 {d12-d15}, [r0]! @ \n\t"
87-
"1: @ \n\t"
88-
"vmul.f32 q0, q0, q14 @ \n\t"
89-
"vmul.f32 q1, q1, q14 @ \n\t"
90-
"vmul.f32 q2, q2, q14 @ \n\t"
91-
"vmul.f32 q3, q3, q14 @ \n\t"
92-
"vmul.f32 q4, q4, q14 @ \n\t"
93-
"vmul.f32 q5, q5, q14 @ \n\t"
94-
"vmul.f32 q6, q6, q14 @ \n\t"
95-
"vmul.f32 q7, q7, q14 @ \n\t"
96-
"subs r5, r5, #1 @ \n\t"
97-
"beq 2f @ \n\t"
98-
"vst1.32 {d0-d3}, [r2]! @ \n\t"
99-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
100-
"vst1.32 {d4-d7}, [r2]! @ \n\t"
101-
"vld1.32 {d4-d7}, [r0]! @ \n\t"
102-
"vst1.32 {d8-d11}, [r2]! @ \n\t"
103-
"vld1.32 {d8-d11}, [r0]! @ \n\t"
104-
"vst1.32 {d12-d15}, [r2]! @ \n\t"
105-
"vld1.32 {d12-d15}, [r0]! @ \n\t"
106-
"b 1b @ \n\t"
107-
"2: @ \n\t"
108-
"vst1.32 {d0-d3}, [r2]! @ \n\t"
109-
"vst1.32 {d4-d7}, [r2]! @ \n\t"
110-
"vst1.32 {d8-d11}, [r2]! @ \n\t"
111-
"vst1.32 {d12-d15}, [r2]! @ \n\t"
112-
"3: @ \n\t"
113-
"lsrs r5, %2, #4 @ \n\t"
114-
"ands r5, r5, #1 @ \n\t"
115-
"beq 4f @ \n\t"
116-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
117-
"vld1.32 {d4-d7}, [r0]! @ \n\t"
118-
"vmul.f32 q0, q0, q14 @ \n\t"
119-
"vmul.f32 q1, q1, q14 @ \n\t"
120-
"vmul.f32 q2, q2, q14 @ \n\t"
121-
"vmul.f32 q3, q3, q14 @ \n\t"
122-
"vst1.32 {d0-d3}, [r2]! @ \n\t"
123-
"vst1.32 {d4-d7}, [r2]! @ \n\t"
124-
"4: @ \n\t"
125-
"lsrs r5, %2, #3 @ \n\t"
126-
"ands r5, r5, #1 @ \n\t"
127-
"beq 5f @ \n\t"
128-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
129-
"vmul.f32 q0, q0, q14 @ \n\t"
130-
"vmul.f32 q1, q1, q14 @ \n\t"
131-
"vst1.32 {d0-d3}, [r2]! @ \n\t"
132-
"5: @ \n\t"
133-
"ands r5, %2, #7 @ \n\t"
134-
"beq 7f @ \n\t"
135-
"6: @ \n\t"
136-
"subs r5, r5, #1 @ \n\t"
137-
"vld1.32 d0[0], [r0]! @ \n\t"
138-
"vmul.f32 d0, d0, d28 @ \n\t"
139-
"vst1.32 d0[0], [r2]! @ \n\t"
140-
"bne 6b @ \n\t"
141-
"7: @ "
142-
:
143-
:"r" (y), "r"(caddr),"r"(n)
144-
: "cc", "r0", "r2", "r5", "memory",
145-
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",
146-
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
147-
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
148-
"d28", "d29"
149-
);
36+
long i = 0;
15037

38+
for(; i < n-4; i +=4)
39+
{
40+
y[i] *= c;
41+
y[i+1] *= c;
42+
y[i+2] *= c;
43+
y[i+3] *= c;
44+
}
45+
46+
for(; i < n; i++)
47+
y[i] *= c;
15148
}
15249

15350
static void THFloatVector_mul_NEON(float *y, const float *x, const ptrdiff_t n) {
154-
__asm__ __volatile__ (
155-
"mov r0, %0 @ \n\t"
156-
"mov r1, %1 @ \n\t"
157-
"mov r2, r0 @ \n\t"
158-
"lsrs r4, %2, #3 @ \n\t"
159-
"beq 3f @ \n\t"
160-
"vld1.32 {d16-d19}, [r1]! @ \n\t"
161-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
162-
"1: @ \n\t"
163-
"vmul.f32 q12, q8, q0 @ \n\t"
164-
"vmul.f32 q13, q9, q1 @ \n\t"
165-
"subs r4, r4, #1 @ \n\t"
166-
"beq 2f @ \n\t"
167-
"vld1.32 {d16-d19}, [r1]! @ \n\t"
168-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
169-
"vst1.32 {d24-d27}, [r2]! @ \n\t"
170-
"b 1b @ \n\t"
171-
"2: @ \n\t"
172-
"vst1.32 {d24-d27}, [r2]! @ \n\t"
173-
"3: @ \n\t"
174-
"ands r4, %2, #7 @ \n\t"
175-
"beq 5f @ \n\t"
176-
"4: @ \n\t"
177-
"subs r4, r4, #1 @ \n\t"
178-
"vld1.32 {d16[0]}, [r1]! @ \n\t"
179-
"vld1.32 {d0[0]}, [r0]! @ \n\t"
180-
"vmul.f32 q12, q8, q0 @ \n\t"
181-
"vst1.32 {d24[0]}, [r2]! @ \n\t"
182-
"bne 4b @ \n\t"
183-
"5: @ "
184-
:
185-
:"r" (y),"r" (x),"r"(n)
186-
: "cc", "r0", "r1", "r2", "r4", "memory",
187-
"q0", "q1", "q8", "q9", "q12", "q13",
188-
"d0", "d1", "d2", "d3",
189-
"d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
190-
);
51+
long i = 0;
52+
53+
for(; i < n-4; i += 4)
54+
{
55+
y[i] *= x[i];
56+
y[i+1] *= x[i+1];
57+
y[i+2] *= x[i+2];
58+
y[i+3] *= x[i+3];
59+
}
60+
61+
for(; i < n; i++)
62+
y[i] *= x[i];
19163
}
19264

19365
static void THFloatVector_add_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
194-
float ctemp = c;
195-
float * caddr = &ctemp;
196-
__asm__ __volatile__ (
197-
"mov r0, %0 @ \n\t"
198-
"mov r1, %1 @ \n\t"
199-
"mov r2, r0 @ \n\t"
200-
"ldr r5, [%2] @ \n\t"
201-
"vdup.32 q14, r5 @ \n\t"
202-
"lsrs r5, %3, #4 @ \n\t"
203-
"beq 3f @ \n\t"
204-
"vld1.32 {d16-d19}, [r1]! @ \n\t"
205-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
206-
"vld1.32 {d20-d23}, [r1]! @ \n\t"
207-
"vld1.32 {d4-d7}, [r0]! @ \n\t"
208-
"1: @ \n\t"
209-
"vmla.f32 q0, q8, q14 @ \n\t"
210-
"vmla.f32 q1, q9, q14 @ \n\t"
211-
"vmla.f32 q2, q10, q14 @ \n\t"
212-
"vmla.f32 q3, q11, q14 @ \n\t"
213-
"subs r5, r5, #1 @ \n\t"
214-
"beq 2f @ \n\t"
215-
"vld1.32 {d16-d19}, [r1]! @ \n\t"
216-
"vld1.32 {d20-d23}, [r1]! @ \n\t"
217-
"vst1.32 {d0-d3}, [r2]! @ \n\t"
218-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
219-
"vst1.32 {d4-d7}, [r2]! @ \n\t"
220-
"vld1.32 {d4-d7}, [r0]! @ \n\t"
221-
"b 1b @ \n\t"
222-
"2: @ \n\t"
223-
"vst1.32 {d0-d3}, [r2]! @ \n\t"
224-
"vst1.32 {d4-d7}, [r2]! @ \n\t"
225-
"3: @ \n\t"
226-
"lsrs r5, %3, #3 @ \n\t"
227-
"ands r5, #1 @ \n\t"
228-
"beq 4f @ \n\t"
229-
"vld1.32 {d16-d19}, [r1]! @ \n\t"
230-
"vld1.32 {d0-d3}, [r0]! @ \n\t"
231-
"vmla.f32 q0, q8, q14 @ \n\t"
232-
"vmla.f32 q1, q9, q14 @ \n\t"
233-
"vst1.32 {d0-d3}, [r2]! @ \n\t"
234-
"4: @ \n\t"
235-
"ands r5, %3, #7 @ \n\t"
236-
"beq 6f @ \n\t"
237-
"5: @ \n\t"
238-
"subs r5, r5, #1 @ \n\t"
239-
"vld1.32 {d16[0]}, [r1]! @ \n\t"
240-
"vld1.32 {d0[0]}, [r0]! @ \n\t"
241-
"vmla.f32 d0, d16, d28 @ \n\t"
242-
"vst1.32 d0[0], [r2]! @ \n\t"
243-
"bne 5b @ \n\t"
244-
"6: @ "
245-
:
246-
:"r" (y),"r" (x), "r"(caddr),"r"(n)
247-
: "cc", "r0", "r1", "r2", "r5", "memory",
248-
"q0", "q1", "q2", "q3", "q14",
249-
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
250-
"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29"
251-
);
66+
long i = 0;
67+
68+
for(;i < n-4; i += 4)
69+
{
70+
y[i] += c * x[i];
71+
y[i+1] += c * x[i+1];
72+
y[i+2] += c * x[i+2];
73+
y[i+3] += c * x[i+3];
74+
}
75+
76+
for(; i < n; i++)
77+
y[i] += c * x[i];
25278
}

0 commit comments

Comments
 (0)