Details:
byteswap.h is fairly suboptimal. Can we replace the definition in the system
include with
the following Blackfin-specific definition for the 32-bit case?
#define __bswap_32(X) ({ unsigned o;\
asm("i0=0;" \
"(r1,r0)=byteunpack r1:0;"\
"r0=pack(r0.l,r0.h);"\
"r1=pack(r1.l,r1.h);"\
"r0=bytepack (r1,r0);"\
: "=d" (o) : "q0" (X) : "R1","I0");
o; })
Currently we do this:
.align 4
.global _xxxx;
.type _xxxx, STT_FUNC;
_xxxx:
R1 = 255 (X);
R1 <<= 16;
R1 = R0 & R1;
R2 = R0 >> 24;
R1 >>= 8;
R2 = R2 | R1;
R1 = 65280 (Z);
R1 = R0 & R1;
R1 <<= 8;
R0 <<= 24;
R1 = R1 | R0;
LINK 0;
R2 = R2 | R1;
R0 = R2;
UNLINK;
rts;
.size _xxxx, .-_xxxx
This is fairly suboptimal. The problem is that libavutil assumes that
if a vendor provides a byteswap header file, it must be
optimal. So I can't optimize this bit in the video stack.
|
Details:
byteswap.h is fairly suboptimal. Can we replace the definition in the system
include with
the following Blackfin-specific definition for the 32-bit case?
#define __bswap_32(X) ({ unsigned o;\
asm("i0=0;" \
"(r1,r0)=byteunpack r1:0;"\
"r0=pack(r0.l,r0.h);"\
"r1=pack(r1.l,r1.h);"\
"r0=bytepack (r1,r0);"\
: "=d" (o) : "q0" (X) : "R1","I0");
o; })
Currently we do this:
.align 4
.global _xxxx;
.type _xxxx, STT_FUNC;
_xxxx:
R1 = 255 (X);
R1 <<= 16;
R1 = R0 & R1;
R2 = R0 >> 24;
R1 >>= 8;
R2 = R2 | R1;
R1 = 65280 (Z);
R1 = R0 & R1;
R1 <<= 8;
R0 <<= 24;
R1 = R1 | R0;
LINK 0;
R2 = R2 | R1;
R0 = R2;
UNLINK;
rts;
.size _xxxx, .-_xxxx
This is fairly suboptimal. The problem is that libavutil assumes that
if a vendor provides a byteswap header file, it must be
optimal. So I can't optimize this bit in the video stack.
|