Skip to content

neon实现memcpy

C++
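// On ARMv7 (AArch32), NEON provides sixteen 128-bit registers, called Q registers (Q0-Q15). Each Q register can also be accessed as two 64-bit halves, giving thirty-two 64-bit registers, called D registers (D0-D31). On ARMv8 (AArch64) the register count doubles compared with ARMv7: there are thirty-two 128-bit registers (V0-V31).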

#ifdef __ARM__
// Copy exactly `sz` bytes from `src` to `dst`.
//
// On 32-bit ARM targets with NEON, the largest multiple of 64 bytes is moved
// with VLDM/VSTM through d0-d7 (eight 64-bit registers = 64 bytes per
// iteration); any remaining 1..63 bytes are copied one byte at a time. On
// every other target the whole copy is done by the byte loop, so the function
// still compiles and behaves the same.
//
// dst : destination buffer, at least `sz` bytes.
// src : source buffer, at least `sz` bytes; like memcpy, the two regions are
//       expected not to overlap.
// sz  : number of bytes to copy; values <= 0 copy nothing.
static void neon_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz)
{
    // The assembly loop below tests its counter only after moving a chunk, so
    // an empty or negative request must be rejected before reaching it.
    if (sz <= 0)
        return;

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    // Round DOWN to a whole number of 64-byte chunks. Rounding up would read
    // and write up to 63 bytes past the end of both buffers.
    int bulk = sz & -64;
    if (bulk > 0) {
        sz -= bulk;
        // "%=" expands to a number unique to this asm instance, so the label
        // cannot collide when the function is inlined or duplicated.
        __asm__ volatile("NEONCopyPLD_%=: \n"
                         " VLDM %[src]!,{d0-d7} \n"      // load 64 bytes, advance src
                         " VSTM %[dst]!,{d0-d7} \n"      // store 64 bytes, advance dst
                         " SUBS %[bulk],%[bulk],#0x40 \n" // 64 fewer bytes left, sets flags
                         " BGT NEONCopyPLD_%= \n"        // loop while bytes remain
                         : [dst] "+r"(dst), [src] "+r"(src), [bulk] "+r"(bulk)
                         :
                         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
    }
#endif

    // Tail (or the entire copy when the NEON path is unavailable).
    while (sz > 0) {
        *dst++ = *src++;
        --sz;
    }
}
#endif

// Copy `sz` bytes from `src` to `dst`.
//
// On 32-bit ARM targets with NEON, the largest multiple of 64 bytes is moved
// with VLDM/VSTM through d0-d7, and the remaining 0..63 bytes are handed to
// memcpy. On every other target the whole request goes to memcpy, so the
// function compiles everywhere.
//
// dst : destination buffer, at least `sz` bytes.
// src : source buffer, at least `sz` bytes; as with memcpy, the regions are
//       expected not to overlap.
// sz  : number of bytes to copy; values <= 0 copy nothing.
static void neon_memcpy(volatile unsigned char* dst, volatile unsigned char* src, int sz)
{
    // Bytes handled by the NEON loop; stays 0 when that path is unavailable.
    int neonCopy = 0;

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    neonCopy = sz - sz % 64;
    if (neonCopy > 0) {
        int tempCount = neonCopy;
        // A plain "NEONCopyPLD:" label fails to build with a symbol
        // redefinition error once this asm is emitted more than once.
        // Appending "_%=" gives every instance its own unique label.
        __asm__ volatile("NEONCopyPLD_%=: \n"
                         // d0-d7 are 64-bit registers: 8 x 8 = 64 bytes per iteration
                         " VLDM %[src]!,{d0-d7} \n"
                         " VSTM %[dst]!,{d0-d7} \n"
                         " SUBS %[tempCount],%[tempCount],#0x40 \n"
                         " BGT NEONCopyPLD_%= \n"
                         : [dst] "+r"(dst), [src] "+r"(src), [tempCount] "+r"(tempCount)
                         :
                         : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
    }
#endif

    // dst/src were advanced by the asm write-back, so they already point at
    // the tail. The casts drop `volatile` because memcpy takes plain pointers.
    if (sz - neonCopy > 0) {
        memcpy((void*)dst, (void*)src, (size_t)(sz - neonCopy));
    }
}