//
//  MNNUnPackC4.S
//  MNN
//
//  Created by MNN on 2019/02/02.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifdef __arm__
#ifndef __aarch64__

#include "MNNAsmGlobal.h"
.text
.align 5

// In-register 4x4 transpose of 32-bit elements held in q0-q3 (d0-d7).
// Before: q0..q3 each hold the four channel values of one pixel.
// After:  q0..q3 each hold one channel for four consecutive pixels.
// Clobbers nothing outside q0-q3.
.macro transpose4x4
    vtrn.32     d0, d2
    vtrn.32     d1, d3
    vtrn.32     d4, d6
    vtrn.32     d5, d7
    vswp        d1, d4
    vswp        d3, d6
.endm

//-----------------------------------------------------------------------
// void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth);
//
// Converts a channel-interleaved source (4 floats per pixel, one group of
// area pixels per 4 channels) into channel-planar output (one plane of
// area floats per channel, planes stored back to back).
//
// In:    r0 = dst, r1 = src, r2 = area, r3 = depth
// Out:   none
// Saved: r4-r8, lr (pushed on entry, restored on exit)
// Uses:  q0-q3 as scratch
//
// Register roles inside the function:
//   r0      write cursor of the first plane of the current channel group
//   r1      read cursor in src (advances 16 bytes per pixel)
//   r2      area (never modified)
//   r3      channels still to be written
//   r4      plane stride in bytes (area * 4)
//   r5-r7   write cursors of the 2nd, 3rd and 4th plane of the group
//   r8      pixels still to be processed in the current group
//-----------------------------------------------------------------------
asm_function MNNUnpackC4

    push        {r4, r5, r6, r7, r8, lr}

    // Return immediately when area * depth is zero.
    mul         r4, r2, r3
    cmp         r4, #0
    beq         UnpackC4End

    // Distance between two consecutive output planes.
    lsl         r4, r2, #2

    // Fewer than four channels in total: go straight to the tails.
    cmp         r3, #3
    ble         UnpackC4Tail3

// ---- Full groups: four output planes per pass -------------------------
UnpackC4QuadGroup:
    add         r5, r0, r4
    add         r6, r5, r4
    add         r7, r6, r4
    mov         r8, r2
    cmp         r8, #3
    ble         UnpackC4QuadPixels

UnpackC4QuadBlock:
    // Four pixels at a time: load 16 floats, transpose, store 4 per plane.
    vld1.32     {q0, q1}, [r1]!
    vld1.32     {q2, q3}, [r1]!
    transpose4x4
    vst1.32     {q0}, [r0]!
    vst1.32     {q1}, [r5]!
    vst1.32     {q2}, [r6]!
    vst1.32     {q3}, [r7]!
    sub         r8, r8, #4
    cmp         r8, #4
    bge         UnpackC4QuadBlock

UnpackC4QuadPixels:
    cmp         r8, #0
    beq         UnpackC4QuadNext

UnpackC4QuadPixelLoop:
    // Leftover pixels one by one: one lane goes to each plane.
    vld1.32     {q0}, [r1]!
    vst1.32     {d0[0]}, [r0]!
    vst1.32     {d0[1]}, [r5]!
    vst1.32     {d1[0]}, [r6]!
    vst1.32     {d1[1]}, [r7]!
    subs        r8, r8, #1
    bne         UnpackC4QuadPixelLoop

UnpackC4QuadNext:
    sub         r3, r3, #4
    // r7 has walked the whole 4th plane, so it now points at the first
    // plane of the next group.
    mov         r0, r7
    cmp         r3, #4
    bge         UnpackC4QuadGroup

// ---- Tail: exactly three channels left --------------------------------
UnpackC4Tail3:
    cmp         r3, #2
    ble         UnpackC4Tail2
    add         r5, r0, r4
    add         r6, r5, r4
    mov         r8, r2
    // area is non-zero here (checked on entry), so a short row can enter
    // the per-pixel loop without a zero test.
    cmp         r8, #3
    ble         UnpackC4Tail3PixelLoop

UnpackC4Tail3Block:
    vld1.32     {q0, q1}, [r1]!
    vld1.32     {q2, q3}, [r1]!
    transpose4x4
    vst1.32     {q0}, [r0]!
    vst1.32     {q1}, [r5]!
    vst1.32     {q2}, [r6]!
    sub         r8, r8, #4
    cmp         r8, #4
    bge         UnpackC4Tail3Block

    cmp         r8, #0
    beq         UnpackC4Tail3Done

UnpackC4Tail3PixelLoop:
    vld1.32     {q0}, [r1]!
    vst1.32     {d0[0]}, [r0]!
    vst1.32     {d0[1]}, [r5]!
    vst1.32     {d1[0]}, [r6]!
    subs        r8, r8, #1
    bne         UnpackC4Tail3PixelLoop

UnpackC4Tail3Done:
    sub         r3, r3, #3

// ---- Tail: exactly two channels left ----------------------------------
UnpackC4Tail2:
    cmp         r3, #1
    ble         UnpackC4Tail1
    add         r5, r0, r4
    mov         r8, r2
    cmp         r8, #3
    ble         UnpackC4Tail2PixelLoop

UnpackC4Tail2Block:
    vld1.32     {q0, q1}, [r1]!
    vld1.32     {q2, q3}, [r1]!
    transpose4x4
    vst1.32     {q0}, [r0]!
    vst1.32     {q1}, [r5]!
    sub         r8, r8, #4
    cmp         r8, #4
    bge         UnpackC4Tail2Block

    cmp         r8, #0
    beq         UnpackC4Tail2Done

UnpackC4Tail2PixelLoop:
    vld1.32     {q0}, [r1]!
    vst1.32     {d0[0]}, [r0]!
    vst1.32     {d0[1]}, [r5]!
    subs        r8, r8, #1
    bne         UnpackC4Tail2PixelLoop

UnpackC4Tail2Done:
    sub         r3, r3, #2

// ---- Tail: exactly one channel left -----------------------------------
UnpackC4Tail1:
    cmp         r3, #0
    beq         UnpackC4End
    mov         r8, r2
    cmp         r8, #3
    ble         UnpackC4Tail1PixelLoop

UnpackC4Tail1Block:
    // The source still carries four floats per pixel; only the first
    // channel (q0 after the transpose) is written out.
    vld1.32     {q0, q1}, [r1]!
    vld1.32     {q2, q3}, [r1]!
    transpose4x4
    vst1.32     {q0}, [r0]!
    sub         r8, r8, #4
    cmp         r8, #4
    bge         UnpackC4Tail1Block

    cmp         r8, #0
    beq         UnpackC4Tail1Done

UnpackC4Tail1PixelLoop:
    vld1.32     {q0}, [r1]!
    vst1.32     {d0[0]}, [r0]!
    subs        r8, r8, #1
    bne         UnpackC4Tail1PixelLoop

UnpackC4Tail1Done:

UnpackC4End:
    pop         {r4, r5, r6, r7, r8, pc}

#endif
#endif