droidVncServer/jni/jpeg/jidctfst.S

477 lines
13 KiB
ArmAsm
Executable File

/*
* Copyright (C) 2008 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <machine/cpu-features.h>
.text
.align
.global jpeg_idct_ifast
// Mark the exported symbol as a function in the ELF symbol table so that the
// linker and debugging tools treat it as code.
.type jpeg_idct_ifast, %function
.func jpeg_idct_ifast
//-----------------------------------------------------------------------------
// void jpeg_idct_ifast (j_decompress_ptr cinfo,           // r0
//                       jpeg_component_info * compptr,    // r1
//                       short* coef_block,                // r2
//                       unsigned char* output_buf,        // r3
//                       int output_col)                   // 5th argument, on the stack
//
// Dequantizes one 8x8 block of 16-bit coefficients and applies a fixed-point
// inverse DCT in two passes: a column pass that writes 32-bit intermediates
// to a workspace in the local stack frame, then a row pass that scales the
// results, clamps them to 0..255 and stores 8 bytes per output row.
//
// The code below reads output_buf as an array of row pointers: one pointer is
// fetched per output row and output_col is added to it.
// All registers pushed on entry (r4-r12, lr) are restored before returning.
//
// NOTE: sb=r9, fp=r11, ip=r12, sp=r13, lr=r14, pc=r15
//-----------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Local stack frame (addressed from sp once local_SIZE bytes are reserved):
//   [sp, #0..#12]   TMP0..TMP3   even-part results spilled between the even
//                                and odd halves of each 1-D transform
//   [sp, #16]       RANGE_TABLE  range-limit table pointer saved on entry
//   [sp, #20]       OUTPUT_COL   5th argument
//   [sp, #24]       OUTPUT_BUF   running pointer into the row-pointer array
//   [sp, #28]       UNUSED
//   [sp, #32...]    WORKSPACE    8*8 32-bit intermediates (256 bytes)
// ---------------------------------------------------------------------------
#define local_TMP0123 sp
#define local_TMP0 [sp, #0]
#define local_TMP1 [sp, #4]
#define local_TMP2 [sp, #8]
#define local_TMP3 [sp, #12]
#define local_RANGE_TABLE [sp, #16]
#define local_OUTPUT_COL [sp, #20]
#define local_OUTPUT_BUF [sp, #24]
#define local_UNUSED [sp, #28]
#define off_WORKSPACE 32
// Fixed: this used to expand to the undefined name "offWORKSPACE". The macro is
// not referenced by the visible code, so the typo was latent.
#define local_WORKSPACE [sp, #off_WORKSPACE]
#define local_SIZE (off_WORKSPACE + 8*8*4)
// Hard-coded byte offsets into the structures passed in r0 (cinfo) and r1
// (compptr). NOTE(review): these must match the C struct layout the library is
// built with - confirm against the headers whenever those structs change.
#define off_DECOMPRESS_range_limit_base 324
#define off_COMPINFO_quanttable 80
#define DCTSIZE 8
// Byte offsets inside an 8x8 block:
//   VY/VX - row/column step for 16-bit entries (the coefficient block)
//   QY/QX - row/column step for 32-bit entries (quant table and workspace)
// VX and QX are not referenced by the visible code.
#define VY(x) ((x)*DCTSIZE*2)
#define QY(x) ((x)*DCTSIZE*4)
#define VX(x) ((x)*2)
#define QX(x) ((x)*4)
// Fixed-point multipliers; products using them are shifted right by 8 in the
// code. The visible code builds these values inline (360+2, 472+1, 276+1,
// 668+1) and does not reference the FIX_* or RANGE_MASK macros.
#define FIX_1_414213562 #362
#define FIX_1_082392200 #277
#define FIX_1_847759065 #473
#define FIX_2_613125930 #669
#define RANGE_MASK 1023
jpeg_idct_ifast:
    // Entry: r0 = cinfo, r1 = compptr, r2 = coef_block, r3 = output_buf,
    //        [sp] = output_col.
    PLD     (r2, #0)                        // start fetching the coefficients
    // Ten registers (40 bytes) are pushed, so the stacked 5th argument is now
    // at sp + 40. It must be read before the local frame is carved out.
    stmfd   sp!, {r4-r12, lr}
    ldr     r4, [sp, #(10*4)]               // r4 = output_col
    sub     sp, sp, #local_SIZE             // reserve temporaries + workspace

    mov     fp, r2                          // fp = coef_block (column cursor)
    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
    ldr     r10, [r1, #off_COMPINFO_quanttable]     // r10 = quant table cursor
    str     r3, local_OUTPUT_BUF
    str     r4, local_OUTPUT_COL
    // Save (range-limit base + 128) in the frame.
    // NOTE(review): the only visible reader is the reload into r10 at
    // HLoopStart, and that value is never used afterwards.
    add     r5, r5, #128
    add     ip, sp, #off_WORKSPACE          // ip = workspace cursor, column 0
    str     r5, local_RANGE_TABLE
VLoopTail:
    // Column pass, step 1: fetch the eight signed 16-bit coefficients of the
    // current column. fp points at the row-0 entry; VY(k) is the byte offset
    // of row k. The AC terms land in r1-r7 and the DC term in r0.
    ldrsh   r1, [fp, #VY(1)]
    ldrsh   r2, [fp, #VY(2)]
    ldrsh   r3, [fp, #VY(3)]
    ldrsh   r4, [fp, #VY(4)]
    ldrsh   r5, [fp, #VY(5)]
    ldrsh   r6, [fp, #VY(6)]
    ldrsh   r7, [fp, #VY(7)]
    ldrsh   r0, [fp, #VY(0)]

    // Z is left set only if r1..r7 are all zero: each orreqs executes only
    // while the previous test still reads "zero", and updates the flags
    // itself. r8 is just a scratch destination.
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     VLoopHeadZero                   // DC-only column: take the shortcut
VLoopHead:
// Column pass, general case. Reached by fall-through from VLoopTail with
// r0-r7 = coefficients in[0..7] of the current column,
// r10 = quant-table entry for this column (32-bit entries, row stride QY(1)),
// fp = coefficient column cursor, ip = workspace column cursor.
// Dequantizes the column, runs the 1-D inverse DCT on it and stores the eight
// 32-bit results down the workspace column.
// Products with the FIX_* constants are scaled back with "asr #8".
//
// Even part
// tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0]) (r0)
// tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4]) (r4)
// tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2]) (r2)
// tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6]) (r6)
// tmp10 = tmp0 + tmp2 (r0)
// tmp11 = tmp0 - tmp2 (r4)
ldr r9, [r10, #QY(4)]
ldr r8, [r10, #QY(0)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
smulbb r4, r9, r4 // r4 = tmp2
smlabb r0, r8, r0, r4 // r0 = quant[0]*in[0] + tmp2 = tmp10
#else
mul r4, r9, r4 // r4 = tmp2
mul r0, r8, r0 // r0 = tmp0
add r0, r4 // r0 = tmp10
#endif
ldr r9, [r10, #QY(6)]
ldr r8, [r10, #QY(2)]
sub r4, r0, r4, lsl #1 // r4 = tmp10 - 2*tmp2 = tmp11
#if __ARM_HAVE_HALFWORD_MULTIPLY
smulbb r6, r9, r6 // r6 = tmp3
smlabb r2, r8, r2, r6 // r2 = quant[2]*in[2] + tmp3 = tmp13
#else
mul r6, r9, r6 // r6 = tmp3
mul r2, r8, r2 // r2 = tmp1
add r2, r6 // r2 = tmp13
#endif
// tmp13 = tmp1 + tmp3 (r2)
// tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13 (r6)
// FIX_1_4142... = 362 = 45*8 + 2, built here as 360 + 2 in two instructions
sub r6, r2, r6, lsl #1 // r6 = tmp13 - 2*tmp3 = tmp1 - tmp3
mov r8, #360
add r8, r8, #2
mul r9, r6, r8 // r9 = (tmp1 - tmp3) * 362, still scaled by 256
// tmp0 = tmp10 + tmp13; (r0)
// tmp3 = tmp10 - tmp13; (r8)
// tmp1 = tmp11 + tmp12; (r4)
// tmp2 = tmp11 - tmp12; (r6)
add r0, r0, r2
rsb r6, r2, r9, asr #8 // r6 = (r9 >> 8) - tmp13 = tmp12
sub r8, r0, r2, lsl #1 // r8 = tmp0 - 2*tmp13 = tmp10 - tmp13
add r4, r4, r6
sub r6, r4, r6, lsl #1 // r6 = tmp1 - 2*tmp12 = tmp11 - tmp12
// Spill tmp0..tmp3 to local_TMP0..local_TMP3 so r0/r4/r6/r8 can be reused.
stmia local_TMP0123, {r0, r4, r6, r8}
// NOTE: be sure to not use r0,r4,r6,r8 soon after stm above
// odd part
// tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] ) (r1)
// tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] ) (r5)
// tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] ) (r3)
// tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] ) (r7)
// z13 = tmp6 + tmp5; (r0)
// z10 = tmp6 - tmp5; (r2)
// z11 = tmp4 + tmp7; (r4)
// z12 = tmp4 - tmp7; (r6)
ldr r2, [r10, #QY(1)]
ldr r9, [r10, #QY(5)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
smulbb r1, r2, r1 // r1 = tmp4
#else
mul r1, r2, r1 // r1 = tmp4
#endif
ldr r2, [r10, #QY(3)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
smulbb r5, r9, r5 // r5 = tmp6
#else
mul r5, r9, r5 // r5 = tmp6
#endif
ldr r9, [r10, #QY(7)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
smlabb r0, r2, r3, r5 // r0 = quant[3]*in[3] + tmp6 = z13
smlabb r4, r9, r7, r1 // r4 = quant[7]*in[7] + tmp4 = z11
#else
mul r0, r2, r3 // r0 = tmp5
add r0, r5 // r0 = z13
mul r4, r9, r7 // r4 = tmp7
add r4, r1 // r4 = z11
#endif
rsb r2, r0, r5, lsl #1 // r2 = 2*tmp6 - z13 = z10
rsb r6, r4, r1, lsl #1 // r6 = 2*tmp4 - z11 = z12
// tmp7 = z11 + z13; (r7)
// tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
// FIX_... = 360 + 2
add r7, r4, r0
sub r1, r4, r0
mov r8, #360
add r8, r8, #2
mul r1, r8, r1 // r1 = tmp11, still scaled by 256
// z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8)
// tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0)
// tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2)
// FIX_1_8477... = 473 = 472 + 1
// FIX_1_082... = 277 = 276 + 1
// FIX_2_... = 669 = 668 + 1
// Each mla computes x*(K-1) + x, i.e. x*K, with K-1 loaded as an immediate.
add r8, r2, r6
mov r9, #472
mla r8, r9, r8, r8 // r8 = z5 (scaled by 256)
mov r9, #276
mla r0, r6, r9, r6 // r0 = z12 * 277
mov r9, #668
mla r2, r9, r2, r2 // r2 = z10 * 669
sub r0, r0, r8 // r0 = tmp10 (scaled by 256)
rsb r2, r2, r8 // r2 = z5 - z10*669 = tmp12 (scaled by 256)
// tmp6 = tmp12 - tmp7; (r6)
// tmp5 = tmp11 - tmp6; (r5)
// tmp4 = tmp10 + tmp5; (r4)
rsb r6, r7, r2, asr #8
rsb r5, r6, r1, asr #8
add r4, r5, r0, asr #8
// Reload the even-part results: r0..r3 = tmp0..tmp3.
ldmia local_TMP0123, {r0, r1, r2, r3}
// wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
// wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
// wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
// wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
// wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
// wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
// wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
// wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
// Each pair is formed as sum = a + b, then difference = sum - 2*b
// (for rows 3/4 the difference is formed first and the sum as diff + 2*b).
add r0, r0, r7
sub r7, r0, r7, lsl #1
add r1, r1, r6
sub r6, r1, r6, lsl #1
add r2, r2, r5
sub r5, r2, r5, lsl #1
sub r3, r3, r4
add r4, r3, r4, lsl #1
str r0, [ip, #QY(0)]
str r1, [ip, #QY(1)]
str r2, [ip, #QY(2)]
str r3, [ip, #QY(3)]
str r4, [ip, #QY(4)]
str r5, [ip, #QY(5)]
str r6, [ip, #QY(6)]
str r7, [ip, #QY(7)]
// inptr++; /* advance pointers to next column */
// quantptr++;
// wsptr++;
add fp, fp, #2
add r10, r10, #4
add ip, ip, #4
// Loop until ip has advanced past the eighth column of workspace row 0;
// then fall through into the row pass.
add r0, sp, #(off_WORKSPACE + 4*8)
cmp ip, r0
bne VLoopTail
HLoopStart:
    // Row pass setup: rewind the workspace cursor to the first row and
    // prefetch it.
    // NOTE(review): r10 is reloaded with the saved range-table pointer, but no
    // row-pass code visible in this file reads r10 afterwards.
    ldr     r10, local_RANGE_TABLE
    PLD     (sp, #off_WORKSPACE)
    add     ip, sp, #off_WORKSPACE
HLoopTail:
    // fp = *output_buf++ + output_col
    // Fetch the next row pointer, write the advanced output_buf back to the
    // frame, and form the destination address for this row.
    ldr     r1, local_OUTPUT_COL
    ldr     r0, local_OUTPUT_BUF
    ldr     r2, [r0], #4
    add     fp, r1, r2
    str     r0, local_OUTPUT_BUF

    // Load one workspace row (eight 32-bit values) into r0-r7 and step ip to
    // the next row; prefetch that next row first.
    PLD     (ip, #32)
    ldmia   ip!, {r0, r1, r2, r3, r4, r5, r6, r7}

    // Z stays set only when r1..r7 are all zero (r8 is scratch).
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     HLoopTailZero                   // only wsptr[0] is non-zero
HLoopHead:
// Row pass, general case. Reached by fall-through from HLoopTail with
// r0-r7 = wsptr[0..7] of the current workspace row, ip already advanced to
// the next row, and fp = destination address for this output row.
// Runs the same 1-D inverse DCT as the column pass (no dequantization), then
// scales, biases by 128, clamps to 0..255 and stores the 8 result bytes.
//
// Even part
// tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); (r0)
// tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); (r4)
add r0, r0, r4
sub r4, r0, r4, lsl #1 // r4 = tmp10 - 2*wsptr[4] = tmp11
// tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); (r2)
// tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13; (r6)
// FIX_... = 360 + 2
add r2, r2, r6
sub r6, r2, r6, lsl #1 // r6 = wsptr[2] - wsptr[6]
mov r8, #360
add r8, r8, #2
mul r6, r8, r6 // r6 = (wsptr[2] - wsptr[6]) * 362, still scaled by 256
// tmp0 = tmp10 + tmp13; (r0)
// tmp3 = tmp10 - tmp13; (r8)
// tmp1 = tmp11 + tmp12; (r4)
// tmp2 = tmp11 - tmp12; (r6)
add r0, r0, r2
rsb r6, r2, r6, asr #8 // r6 = (r6 >> 8) - tmp13 = tmp12
sub r8, r0, r2, lsl #1
add r4, r4, r6
sub r6, r4, r6, lsl #1
// Spill tmp0..tmp3 to local_TMP0..local_TMP3 so r0/r4/r6/r8 can be reused.
stmia local_TMP0123, {r0, r4, r6, r8}
// Odd part
// z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; (r0)
// z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; (r2)
// z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; (r4)
// z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; (r6)
add r0, r5, r3
sub r2, r5, r3
add r4, r1, r7
sub r6, r1, r7
// tmp7 = z11 + z13; (r7)
// tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
// FIX_... = 360 + 2
add r7, r4, r0
sub r1, r4, r0
mov r8, #360
add r8, r8, #2
mul r1, r8, r1 // r1 = tmp11, still scaled by 256
// z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8)
// tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0)
// tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2)
// FIX_1_8477... = 473 = 472 + 1
// FIX_1_082... = 277 = 276 + 1
// FIX_2_... = 669 = 668 + 1
// Each mla computes x*(K-1) + x, i.e. x*K, with K-1 loaded as an immediate.
add r8, r2, r6
mov r9, #472
mla r8, r9, r8, r8 // r8 = z5 (scaled by 256)
mov r9, #276
mla r0, r6, r9, r6 // r0 = z12 * 277
mov r9, #668
mla r2, r9, r2, r2 // r2 = z10 * 669
sub r0, r0, r8 // r0 = tmp10 (scaled by 256)
sub r2, r8, r2 // r2 = z5 - z10*669 = tmp12 (scaled by 256)
// tmp6 = tmp12 - tmp7; (r6)
// tmp5 = tmp11 - tmp6; (r5)
// tmp4 = tmp10 + tmp5; (r4)
rsb r6, r7, r2, asr #8
rsb r5, r6, r1, asr #8
add r4, r5, r0, asr #8
// Reload the even-part results: r0..r3 = tmp0..tmp3.
ldmia local_TMP0123, {r0, r1, r2, r3}
// outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
// outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
// outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
// outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
// outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
// outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
// outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
// outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
// The code below does not index a range_limit table: each value is shifted
// right by 5, biased by 128 (r8) and then clamped to 0..255 directly.
// After this group rN holds outptr[N] for N = 0..7.
mov r8, #128
add r0, r0, r7
sub r7, r0, r7, lsl #1
add r0, r8, r0, asr #5
add r7, r8, r7, asr #5
add r1, r1, r6
sub r6, r1, r6, lsl #1
add r1, r8, r1, asr #5
add r6, r8, r6, asr #5
add r2, r2, r5
sub r5, r2, r5, lsl #1
add r2, r8, r2, asr #5
add r5, r8, r5, asr #5
sub r3, r3, r4
add r4, r3, r4, lsl #1
add r3, r8, r3, asr #5
add r4, r8, r4, asr #5
#if __ARM_ARCH__ >= 6
// Unsigned saturate each value to 8 bits (0..255).
usat r0, #8, r0
usat r1, #8, r1
usat r2, #8, r2
usat r3, #8, r3
usat r4, #8, r4
usat r5, #8, r5
usat r6, #8, r6
usat r7, #8, r7
#else
// Manual clamp. "hi" is an unsigned compare, so it is true both for values
// above 255 and for negative values. mvn of (x asr #31) yields all-ones for a
// positive x and 0 for a negative x; the and then leaves 255 or 0.
// r7 and r3 skip the andhi: they are only ever inserted with "lsl #24" below,
// which shifts everything above their low 8 bits out of the word.
cmp r0, #255
mvnhi r0, r0, asr #31
andhi r0, #255
cmp r7, #255
mvnhi r7, r7, asr #31
cmp r1, #255
mvnhi r1, r1, asr #31
andhi r1, #255
cmp r6, #255
mvnhi r6, r6, asr #31
andhi r6, #255
cmp r2, #255
mvnhi r2, r2, asr #31
andhi r2, #255
cmp r5, #255
mvnhi r5, r5, asr #31
andhi r5, #255
cmp r3, #255
mvnhi r3, r3, asr #31
cmp r4, #255
mvnhi r4, r4, asr #31
andhi r4, #255
#endif
// Pack the bytes into two words: r0 = r3:r2:r1:r0, r1 = r7:r6:r5:r4
// (most significant byte first).
// r3 r2 r1 r0
orr r0, r0, r1, lsl #8
orr r0, r0, r2, lsl #16
orr r0, r0, r3, lsl #24
// r7 r6 r5 r4
orr r1, r4, r5, lsl #8
orr r1, r1, r6, lsl #16
orr r1, r1, r7, lsl #24
// Store all 8 output bytes with one two-word store.
// NOTE(review): this presumably relies on fp being word-aligned and on
// little-endian byte order so that r0's low byte lands at outptr[0] - confirm.
stmia fp, {r0, r1}
// Loop until ip has walked past the last of the 8 workspace rows; then fall
// through to Exit.
add r0, sp, #(off_WORKSPACE + 8*8*4)
cmp ip, r0
bne HLoopTail
Exit:
    // Release the local frame, restore the ten registers pushed on entry and
    // return to the caller.
    add     sp, sp, #local_SIZE
    ldmfd   sp!, {r4-r12, lr}
    bx      lr
VLoopHeadZero:
    // Column-pass shortcut: every AC coefficient of this column is zero, so
    // all eight workspace entries of the column receive the dequantized DC
    // value.
    // In: r0 = in[0], r10 = quant cursor, fp = coefficient cursor,
    //     ip = workspace cursor.
    ldr     r1, [r10], #4                   // r1 = quant[0]; quantptr++
    add     fp, fp, #2                      // inptr++
    mul     r0, r1, r0                      // r0 = in[0] * quant[0]
    str     r0, [ip, #QY(1)]
    str     r0, [ip, #QY(2)]
    str     r0, [ip, #QY(3)]
    str     r0, [ip, #QY(4)]
    str     r0, [ip, #QY(5)]
    str     r0, [ip, #QY(6)]
    str     r0, [ip, #QY(7)]
    str     r0, [ip], #4                    // row 0, then wsptr++

    // Go on with the next column unless all eight have been processed.
    add     r0, sp, #(off_WORKSPACE + 4*8)
    cmp     ip, r0
    bne     VLoopTail
    b       HLoopStart
HLoopTailZero:
    // Row-pass shortcut: wsptr[1..7] are all zero, so the eight output bytes
    // of this row are identical. In: r0 = wsptr[0], fp = destination address,
    // ip = next workspace row.
    mov     r1, #128
    add     r0, r1, r0, asr #5              // r0 = (wsptr[0] >> 5) + 128

#if __ARM_ARCH__ >= 6
    usat    r0, #8, r0                      // clamp to 0..255
#else
    // Unsigned "hi" catches both > 255 and negative values; the result is
    // 255 for a positive overflow and 0 for a negative value.
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, r0, #255
#endif

    // Replicate the byte across the word, duplicate the word and store all
    // 8 output bytes at once.
    orr     r0, r0, r0, lsl #8
    orr     r0, r0, r0, lsl #16
    mov     r1, r0
    stmia   fp, {r0, r1}

    // Next row unless ip has passed the last of the 8 workspace rows.
    add     r0, sp, #(off_WORKSPACE + 8*8*4)
    cmp     ip, r0
    bne     HLoopTail
    b       Exit
    .endfunc