477 lines
13 KiB
ArmAsm
Executable File
477 lines
13 KiB
ArmAsm
Executable File
/*
|
|
* Copyright (C) 2008 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include <machine/cpu-features.h>
|
|
|
|
// Section setup and symbol export for the routine defined below.
    .text
    .align

    .global jpeg_idct_ifast
    .func   jpeg_idct_ifast

// NOTE: sb=r9, fp=r11, ip=r12, sp=r13, lr=r14, pc=r15

// jpeg_idct_ifast (j_decompress_ptr cinfo,
//                  jpeg_component_info * compptr,
//                  short* coef_block,
//                  unsigned char** output_buf,
//                  int output_col)
//
// output_buf is treated as an array of row pointers: one pointer is
// loaded from it for every output row and output_col is added to it.

// Stack frame, addressed from sp once local_SIZE bytes have been reserved:
//   bytes  0..15   four temporaries, stored and reloaded as one group
//   byte   16      slot for the range-table pointer
//   byte   20      output_col
//   byte   24      current position in the output_buf pointer array
//   byte   28      unused
//   bytes  32..    workspace of 8*8 32-bit words
#define local_TMP0123       sp
#define local_TMP0          [sp, #0]
#define local_TMP1          [sp, #4]
#define local_TMP2          [sp, #8]
#define local_TMP3          [sp, #12]
#define local_RANGE_TABLE   [sp, #16]
#define local_OUTPUT_COL    [sp, #20]
#define local_OUTPUT_BUF    [sp, #24]
#define local_UNUSED        [sp, #28]
#define off_WORKSPACE       32
#define local_WORKSPACE     [sp, #off_WORKSPACE]
#define local_SIZE          (off_WORKSPACE + 8*8*4)

// Hard-coded byte offsets into the structures pointed to by cinfo and
// compptr. They have to match the C structure layouts in use - verify
// them whenever those structures change.
#define off_DECOMPRESS_range_limit_base 324
#define off_COMPINFO_quanttable         80

// Byte offsets inside 8x8 blocks:
//   VY(n) / VX(n)  row n / column n of a block of 16-bit elements
//   QY(n) / QX(n)  row n / column n of a block of 32-bit elements
#define DCTSIZE 8
#define VY(x)   ((x)*DCTSIZE*2)
#define QY(x)   ((x)*DCTSIZE*4)

#define VX(x)   ((x)*2)
#define QX(x)   ((x)*4)

// Multipliers scaled by 256 (8 fractional bits), written as immediates.
#define FIX_1_414213562 #362
#define FIX_1_082392200 #277
#define FIX_1_847759065 #473
#define FIX_2_613125930 #669

#define RANGE_MASK 1023
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// jpeg_idct_ifast
//
// Dequantizes one 8x8 block of coefficients, runs a two-pass integer
// inverse DCT over it and writes eight rows of eight clamped bytes.
//
// In:   r0   = cinfo (not dereferenced by this routine)
//       r1   = compptr; the quantization table pointer is read from
//              byte offset off_COMPINFO_quanttable
//       r2   = coef_block, signed 16-bit coefficients, 8 per row
//       r3   = output_buf, array of row pointers (one consumed per row)
//       [sp] = output_col, added to every row pointer
// Out:  nothing in registers; 8 bytes are stored through each row pointer
// Saved and restored: r4-r12, lr.   Scratch: r0-r3, flags.
//
// Frame: 10 registers are pushed (40 bytes), then local_SIZE bytes are
// reserved for the temporaries, the saved arguments and the workspace.
// The local_RANGE_TABLE slot of the frame is not used: results are clamped
// with usat (ARMv6 and later) or a compare/invert/mask sequence, so no
// range-limit table lookup is needed.
//
// Pass 1 (VLoop*) walks the 8 columns of the input:
//       fp = coefficient pointer, r10 = quantization pointer (32-bit
//       entries), ip = workspace pointer. Results are 32-bit words.
// Pass 2 (HLoop*) walks the 8 rows of the workspace:
//       ip = workspace pointer, fp = destination of the current row.
//
// Multiplications by the DCT constants use factors scaled by 256 and the
// products are shifted right by 8 when they are consumed.
//
// NOTE(review): each row is written with one stmia of two words, so the
// row pointer plus output_col is presumably expected to be word aligned,
// and the byte order of the packed pixels presumably assumes a
// little-endian target - confirm against the callers and build flags.
// ---------------------------------------------------------------------------
jpeg_idct_ifast:
    PLD     (r2, #0)
    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    // fifth argument: first word above the 10 registers just pushed
    ldr     r4, [sp, #4*10]
    sub     sp, #local_SIZE

    ldr     r10,[r1, #off_COMPINFO_quanttable]         // r10 = quanttable
    str     r4, local_OUTPUT_COL
    str     r3, local_OUTPUT_BUF
    mov     fp, r2                      // fp = coef_block
    add     ip, sp, #off_WORKSPACE      // ip = workspace

    // ---------------- pass 1: columns -> workspace ----------------
    // Entry point of every column iteration: load the 8 coefficients of
    // the current column into r0-r7.
VLoopTail:
    ldrsh   r0, [fp, #VY(0)]
    ldrsh   r1, [fp, #VY(1)]
    ldrsh   r2, [fp, #VY(2)]
    ldrsh   r3, [fp, #VY(3)]
    ldrsh   r4, [fp, #VY(4)]
    ldrsh   r5, [fp, #VY(5)]
    ldrsh   r6, [fp, #VY(6)]
    ldrsh   r7, [fp, #VY(7)]

    // Z stays set only if r1..r7 are all zero (each orreqs runs only
    // while everything tested so far was zero); r8 is a throwaway.
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     VLoopHeadZero

VLoopHead:
    // even part
    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0])   (r0)
    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4])   (r4)
    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2])   (r2)
    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6])   (r6)
    // tmp10 = tmp0 + tmp2   (r0)
    // tmp11 = tmp0 - tmp2   (r4)

    ldr     r9, [r10, #QY(4)]
    ldr     r8, [r10, #QY(0)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r4, r9, r4
    smlabb  r0, r8, r0, r4
#else
    mul     r4, r9, r4
    mul     r0, r8, r0
    add     r0, r4
#endif
    ldr     r9, [r10, #QY(6)]
    ldr     r8, [r10, #QY(2)]
    // r0 holds the sum, so sum - 2*tmp2 gives the difference
    sub     r4, r0, r4, lsl #1
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r6, r9, r6
    smlabb  r2, r8, r2, r6
#else
    mul     r6, r9, r6
    mul     r2, r8, r2
    add     r2, r6
#endif

    // tmp13 = tmp1 + tmp3                                       (r2)
    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
    // FIX_1_4142... = 362 = 45*8 + 2
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r9, r6, r8

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r9, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    // park tmp0..tmp3 in the frame; they are reloaded as r0..r3 below
    stmia   local_TMP0123, {r0, r4, r6, r8}

    // NOTE: be sure not to use r0,r4,r6,r8 soon after the stm above

    // odd part
    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
    // z13 = tmp6 + tmp5;  (r0)
    // z10 = tmp6 - tmp5;  (r2)
    // z11 = tmp4 + tmp7;  (r4)
    // z12 = tmp4 - tmp7;  (r6)

    ldr     r2, [r10, #QY(1)]
    ldr     r9, [r10, #QY(5)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r1, r2, r1
#else
    mul     r1, r2, r1
#endif
    ldr     r2, [r10, #QY(3)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r5, r9, r5
#else
    mul     r5, r9, r5
#endif
    ldr     r9, [r10, #QY(7)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smlabb  r0, r2, r3, r5
    smlabb  r4, r9, r7, r1
#else
    mul     r0, r2, r3
    add     r0, r5
    mul     r4, r9, r7
    add     r4, r1
#endif
    // differences recovered from the sums: 2*tmp6 - z13 and 2*tmp4 - z11
    rsb     r2, r0, r5, lsl #1
    rsb     r6, r4, r1, lsl #1

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    // each mla computes x*(k-1) + x, i.e. x*k
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    rsb     r2, r2, r8

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    // r0..r3 = tmp0..tmp3 of the even part
    ldmia   local_TMP0123, {r0, r1, r2, r3}

    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);

    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1

    str     r0, [ip, #QY(0)]
    str     r1, [ip, #QY(1)]
    str     r2, [ip, #QY(2)]
    str     r3, [ip, #QY(3)]
    str     r4, [ip, #QY(4)]
    str     r5, [ip, #QY(5)]
    str     r6, [ip, #QY(6)]
    str     r7, [ip, #QY(7)]

    // inptr++;                    /* advance pointers to next column */
    // quantptr++;
    // wsptr++;
    add     fp, fp, #2
    add     r10, r10, #4
    add     ip, ip, #4
    // done once ip has moved past the 8 words of the first workspace row
    add     r0, sp, #(off_WORKSPACE + 4*8)
    cmp     ip, r0
    bne     VLoopTail

    // ---------------- pass 2: workspace rows -> output ----------------
HLoopStart:
    // reset pointers
    PLD     (sp, #off_WORKSPACE)
    add     ip, sp, #off_WORKSPACE

    // Entry point of every row iteration.
HLoopTail:
    // output = *output_buf++ + output_col
    ldr     r0, local_OUTPUT_BUF
    ldr     r1, local_OUTPUT_COL
    ldr     r2, [r0], #4
    str     r0, local_OUTPUT_BUF
    add     fp, r2, r1

    PLD     (ip, #32)
    // r0..r7 = wsptr[0..7]; ip advances to the next row
    ldmia   ip!, {r0-r7}

    // same all-zero test as in pass 1, on wsptr[1..7]
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     HLoopTailZero

HLoopHead:
    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);   (r0)
    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);   (r4)
    add     r0, r0, r4
    sub     r4, r0, r4, lsl #1

    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);   (r2)
    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6],
    //                  FIX_1_414213562) - tmp13;           (r6)
    // FIX_... = 360 + 2
    add     r2, r2, r6
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r6, r8, r6

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r6, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    stmia   local_TMP0123, {r0, r4, r6, r8}

    // Odd part

    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];   (r0)
    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];   (r2)
    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];   (r4)
    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];   (r6)
    add     r0, r5, r3
    sub     r2, r5, r3
    add     r4, r1, r7
    sub     r6, r1, r7

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    sub     r2, r8, r2

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    ldmia   local_TMP0123, {r0, r1, r2, r3}

    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
    //
    // Implemented here as: shift right by 5, add 128, clamp to 0..255.
    // After this sequence rN holds the value for outptr[N].

    mov     r8, #128
    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r0, r8, r0, asr #5
    add     r7, r8, r7, asr #5
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r1, r8, r1, asr #5
    add     r6, r8, r6, asr #5
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    add     r2, r8, r2, asr #5
    add     r5, r8, r5, asr #5
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1
    add     r3, r8, r3, asr #5
    add     r4, r8, r4, asr #5

#if __ARM_ARCH__ >= 6
    usat    r0, #8, r0
    usat    r1, #8, r1
    usat    r2, #8, r2
    usat    r3, #8, r3
    usat    r4, #8, r4
    usat    r5, #8, r5
    usat    r6, #8, r6
    usat    r7, #8, r7
#else
    // Unsigned "higher than 255" catches both negative values and values
    // above 255. The inverted sign mask is all zeroes for a negative
    // input and all ones otherwise; masking with 255 yields 0 or 255.
    // r7 and r3 skip the mask on purpose: both are merged below with
    // lsl #24, which already discards everything above their low byte.
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, #255
    cmp     r7, #255
    mvnhi   r7, r7, asr #31
    cmp     r1, #255
    mvnhi   r1, r1, asr #31
    andhi   r1, #255
    cmp     r6, #255
    mvnhi   r6, r6, asr #31
    andhi   r6, #255
    cmp     r2, #255
    mvnhi   r2, r2, asr #31
    andhi   r2, #255
    cmp     r5, #255
    mvnhi   r5, r5, asr #31
    andhi   r5, #255
    cmp     r3, #255
    mvnhi   r3, r3, asr #31
    cmp     r4, #255
    mvnhi   r4, r4, asr #31
    andhi   r4, #255
#endif

    // first word, from high byte to low byte: r3 r2 r1 r0
    orr     r0, r0, r1, lsl #8
    orr     r0, r0, r2, lsl #16
    orr     r0, r0, r3, lsl #24

    // second word, from high byte to low byte: r7 r6 r5 r4
    orr     r1, r4, r5, lsl #8
    orr     r1, r1, r6, lsl #16
    orr     r1, r1, r7, lsl #24
    stmia   fp, {r0, r1}

    // loop until ip has walked over the whole 8x8 workspace
    add     r0, sp, #(off_WORKSPACE + 8*8*4)
    cmp     ip, r0
    bne     HLoopTail

Exit:
    add     sp, sp, #local_SIZE
    ldmia   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    bx      lr


    // Pass 1 shortcut: only the first coefficient of the column may be
    // non-zero, so the dequantized value is stored to all 8 workspace
    // rows of this column.
VLoopHeadZero:
    // ok, all AC coefficients are 0
    ldr     r1, [r10, #QY(0)]
    add     fp, fp, #2
    add     r10, r10, #4
    mul     r0, r1, r0
    str     r0, [ip, #QY(0)]
    str     r0, [ip, #QY(1)]
    str     r0, [ip, #QY(2)]
    str     r0, [ip, #QY(3)]
    str     r0, [ip, #QY(4)]
    str     r0, [ip, #QY(5)]
    str     r0, [ip, #QY(6)]
    str     r0, [ip, #QY(7)]
    add     ip, ip, #4
    add     r0, sp, #(off_WORKSPACE + 4*8)
    cmp     ip, r0
    beq     HLoopStart
    b       VLoopTail

    // Pass 2 shortcut: wsptr[1..7] are all zero, so one descaled and
    // clamped value is replicated into all 8 output bytes of the row.
HLoopTailZero:
    mov     r0, r0, asr #5
    add     r0, #128

#if __ARM_ARCH__ >= 6
    usat    r0, #8, r0
#else
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, r0, #255
#endif

    // replicate the byte into all four byte lanes of r0
    orr     r0, r0, lsl #8
    orr     r0, r0, lsl #16
    mov     r1, r0
    stmia   fp, {r0, r1}

    add     r0, sp, #(off_WORKSPACE + 64*4)
    cmp     ip, r0
    beq     Exit
    b       HLoopTail

    .endfunc
|