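// On ARMv7 there are sixteen 128-bit registers, known as Q registers.
// Each 128-bit register can also be split into two 64-bit registers, giving
// thirty-two 64-bit registers in total; these are known as D registers.
// On ARMv8 the number of registers is doubled compared with ARMv7.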
#ifdef __ARM__
/**
 * Copy exactly sz bytes from src to dst.
 *
 * On AArch32 targets with NEON, whole 64-byte blocks are moved with
 * VLDM/VSTM through d0-d7 (eight 64-bit registers = 64 bytes per
 * iteration); the remaining 0..63 bytes are copied one byte at a time.
 * On every other target the whole copy is done bytewise.
 *
 * Earlier revisions rounded sz UP to a multiple of 64 and therefore read and
 * wrote up to 63 bytes past the requested length; this version never touches
 * memory outside [dst, dst + sz) and [src, src + sz).
 *
 * @param dst destination buffer, at least sz bytes
 * @param src source buffer, at least sz bytes
 * @param sz  number of bytes to copy; values <= 0 copy nothing
 *
 * NOTE(review): VLDM/VSTM are expected to require word-aligned addresses -
 * confirm that callers pass suitably aligned pointers.
 * NOTE(review): this file also contains a second, unconditional definition of
 * neon_memcpy after the #endif; defining __ARM__ makes both visible.
 */
static void neon_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz)
{
    if (sz <= 0) {
        return;
    }

    int remaining = sz;

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    /* Number of bytes covered by whole 64-byte blocks. */
    int bulk = remaining & ~63;
    remaining -= bulk;

    /* The asm loop tests the counter only after the first transfer, so it
     * must not be entered with a zero count. */
    if (bulk > 0) {
        /* "%=" expands to a number unique to this asm instance, so the label
         * stays unique even when the function is inlined several times. */
        __asm__ volatile("NEONCopyPLD_%=: \n"
                         " VLDM %[src]!,{d0-d7} \n"
                         " VSTM %[dst]!,{d0-d7} \n"
                         " SUBS %[sz],%[sz],#0x40 \n"
                         " BGT NEONCopyPLD_%= \n"
                         : [dst] "+r"(dst), [src] "+r"(src), [sz] "+r"(bulk)
                         :
                         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
    }
    /* dst and src were advanced past the copied blocks by the "!" writeback. */
#endif

    /* Tail (or the whole buffer when the NEON path is unavailable). */
    for (; remaining > 0; --remaining) {
        *dst++ = *src++;
    }
}
#endif
/**
 * Copy exactly sz bytes from src to dst.
 *
 * On AArch32 targets with NEON, the largest multiple of 64 bytes that fits in
 * sz is moved with VLDM/VSTM through d0-d7, and the remaining 0..63 bytes are
 * handed to memcpy. On every other target the whole copy is done by memcpy,
 * so the file still builds where the ARM instructions are unavailable.
 *
 * @param dst destination buffer, at least sz bytes
 * @param src source buffer, at least sz bytes
 * @param sz  number of bytes to copy; values <= 0 copy nothing
 *
 * NOTE(review): VLDM/VSTM are expected to require word-aligned addresses -
 * confirm that callers pass suitably aligned pointers.
 */
static void neon_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz)
{
    if (sz <= 0) {
        return;
    }

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    /* Bytes covered by whole 64-byte blocks. */
    int neonCopy = sz - sz % 64;
    if (neonCopy > 0) {
        /* The asm decrements its counter, so work on a copy and keep
         * neonCopy intact for the tail computation below. */
        int tempCount = neonCopy;
        /* A plain "NEONCopyPLD" label is a global symbol and fails to
         * assemble ("symbol redefined") once this asm is emitted more than
         * once, e.g. after inlining. "%=" expands to a number unique to each
         * asm instance, which keeps the label unique. */
        __asm__ volatile("NEONCopyPLD_%=: \n"
                         /* d0-d7 are 64-bit registers: 8 x 8 = 64 bytes per pass */
                         " VLDM %[src]!,{d0-d7} \n"
                         " VSTM %[dst]!,{d0-d7} \n"
                         " SUBS %[tempCount],%[tempCount],#0x40 \n"
                         " BGT NEONCopyPLD_%= \n"
                         : [dst] "+r"(dst), [src] "+r"(src), [tempCount] "+r"(tempCount)
                         :
                         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
    }
    /* dst and src were advanced past the copied blocks by the "!" writeback. */
    int tail = sz - neonCopy;
#else
    int tail = sz;
#endif

    if (tail > 0) {
        memcpy((void*)dst, (const void*)src, (size_t)tail);
    }
}