author | Timothy B. Terriberry <tterribe@vt.edu> |
Thu, 14 Apr 2011 17:34:18 -0700 | |
changeset 68192 | e05cdd49d004d542a232edd51e13e5a9e68668d8 |
parent 68191 | e957f873a56524345a2e5f7251735f949ee5b7e5 |
child 68193 | b92ca278fe217c169c2025ae241a07bd023689b7 |
push id | 19535 |
push user | Ms2ger@gmail.com |
push date | Sat, 16 Apr 2011 09:25:05 +0000 |
treeherder | mozilla-central@bee1149208a9 [default view] [failures only] |
perfherder | [talos] [build metrics] [platform microbench] (compared to previous push) |
reviewers | jbramley |
bugs | 634557 |
milestone | 6.0a1 |
first release with | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
last release without | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
--- a/gfx/ycbcr/Makefile.in +++ b/gfx/ycbcr/Makefile.in @@ -71,18 +71,22 @@ else CPPSRCS += yuv_row_other.cpp \ $(NULL) endif # Darwin endif # SunOS endif # linux endif # windows ifeq (arm,$(findstring arm,$(OS_TEST))) +ifdef HAVE_ARM_NEON CPPSRCS += yuv_convert_arm.cpp \ $(NULL) +ASFILES = yuv_row_arm.$(ASM_SUFFIX) \ + $(NULL) +endif endif EXTRA_DSO_LDOPTS += \ $(LIBS_DIR) \ $(EXTRA_DSO_LIBS) \ $(XPCOM_LIBS) \ $(NSPR_LIBS) \ $(NULL)
--- a/gfx/ycbcr/ycbcr_to_rgb565.cpp +++ b/gfx/ycbcr/ycbcr_to_rgb565.cpp @@ -96,16 +96,19 @@ typedef void (*yuv2rgb565_row_scale_bili typedef void (*yuv2rgb565_row_scale_nearest_func)( const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither); # if defined(MOZILLA_MAY_SUPPORT_NEON) +extern "C" void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON( + const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither); + void __attribute((noinline)) yuv42x_to_rgb565_row_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag); #endif @@ -362,16 +365,25 @@ NS_GFX_(void) ScaleYCbCrToRGB565(const P padded reference frames). In practice, we do not even _have_ the actual bounds of the source, as we are passed a crop rectangle from it, and not the dimensions of the full image. This assertion will not guarantee our out-of-bounds reads are safe, but it should at least catch the simple case of passing in an unpadded buffer.*/ NS_ASSERTION(abs(y_pitch) >= abs(source_width)+16, "ScaleYCbCrToRGB565 source image unpadded?"); + /*The NEON code requires the pointers to be aligned to a 16-byte boundary at + the start of each row. + This should be true for all of our sources. + We could try to fix this up if it's not true by adjusting source_x0, but + that would require the mis-alignment to be the same for the U and V + planes.*/ + NS_ASSERTION((y_pitch&15) == 0 && (uv_pitch&15) == 0 && + ((y_buf-NULL)&15) == 0 && ((u_buf-NULL)&15) == 0 && ((v_buf-NULL)&15) == 0, + "ScaleYCbCrToRGB565 source image unaligned"); /*We take an area-based approach to pixel coverage to avoid shifting by small amounts (or not so small, when up-scaling or down-scaling by a large factor). An illustrative example: scaling 4:2:0 up by 2, using JPEG chroma cositing^. 
+ = RGB destination locations * = Y' source locations @@ -488,18 +500,23 @@ NS_GFX_(void) ScaleYCbCrToRGB565(const P CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; if (uvxscale_min <= abs(source_dx_q16) && abs(source_dx_q16) <= uvxscale_max && uvyscale_min <= abs(source_dy_q16) && abs(source_dy_q16) <= uvyscale_max) { /*Add the rounding offsets now.*/ source_uv_xoffs_q16 += 1<<(15+x_shift); source_uv_yoffs_q16 += 1<<(15+y_shift); - if (yuv_type != YV24) - scale_row = ScaleYCbCr42xToRGB565_BilinearY_Row_C; + if (yuv_type != YV24) { + scale_row = +# if defined(MOZILLA_MAY_SUPPORT_NEON) + supports_neon() ? ScaleYCbCr42xToRGB565_BilinearY_Row_NEON : +# endif + ScaleYCbCr42xToRGB565_BilinearY_Row_C; + } else scale_row = ScaleYCbCr444ToRGB565_BilinearY_Row_C; } else { if (yuv_type == YV12) scale_row = ScaleYCbCr420ToRGB565_Bilinear_Row_C; else if (yuv_type == YV16) scale_row = ScaleYCbCr422ToRGB565_Bilinear_Row_C; @@ -554,16 +571,46 @@ NS_GFX_(bool) IsScaleYCbCrToRGB565Fast(i int source_y0, int source_width, int source_height, int width, int height, YUVType yuv_type, ScaleFilter filter) { + // Very fast. + if (width <= 0 || height <= 0) + return true; +# if defined(MOZILLA_MAY_SUPPORT_NEON) + if (filter != FILTER_NONE) { + int source_dx_q16; + int source_dy_q16; + int uvxscale_min; + int uvxscale_max; + int uvyscale_min; + int uvyscale_max; + source_dx_q16 = (source_width<<16) / width; + source_dy_q16 = (source_height<<16) / height; + uvxscale_min = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvxscale_max = yuv_type != YV24 ? + CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + uvyscale_min = yuv_type == YV12 ? + CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN; + uvyscale_max = yuv_type == YV12 ? 
+ CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX; + if (uvxscale_min <= abs(source_dx_q16) + && abs(source_dx_q16) <= uvxscale_max + && uvyscale_min <= abs(source_dy_q16) + && abs(source_dy_q16) <= uvyscale_max) { + if (yuv_type != YV24) + return supports_neon(); + } + } +# endif return false; } void yuv_to_rgb565_row_c(uint16 *dst, const uint8 *y, const uint8 *u,
new file mode 100644 --- /dev/null +++ b/gfx/ycbcr/yuv_row_arm.s @@ -0,0 +1,329 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is NEON YUV-to-RGB565 scaling code. + * + * The Initial Developer of the Original Code is the Mozilla Foundation. + * Portions created by the Initial Developer are Copyright (C) 2011 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Timothy B. Terriberry <tterriberry@mozilla.com> + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + + .arch armv7-a + .fpu neon + .text + .align + + .balign 64 +YCbCr42xToRGB565_DITHER03_CONSTS_NEON: + .short -14240 + .short -14240+384 + .short 8672 + .short 8672+192 + .short -17696 + .short -17696+384 + .byte 102 + .byte 25 + .byte 52 + .byte 129 +YCbCr42xToRGB565_DITHER12_CONSTS_NEON: + .short -14240+128 + .short -14240+256 + .short 8672+64 + .short 8672+128 + .short -17696+128 + .short -17696+256 + .byte 102 + .byte 25 + .byte 52 + .byte 129 +YCbCr42xToRGB565_DITHER21_CONSTS_NEON: + .short -14240+256 + .short -14240+128 + .short 8672+128 + .short 8672+64 + .short -17696+256 + .short -17696+128 + .byte 102 + .byte 25 + .byte 52 + .byte 129 +YCbCr42xToRGB565_DITHER30_CONSTS_NEON: + .short -14240+384 + .short -14240 + .short 8672+192 + .short 8672 + .short -17696+384 + .short -17696 + .byte 102 + .byte 25 + .byte 52 + .byte 129 + +@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON( +@ yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither); +@ +@ ctx = { +@ PRUint16 *rgb_row; /*r0*/ +@ const PRUint8 *y_row; /*r1*/ +@ const PRUint8 *u_row; /*r2*/ +@ const PRUint8 *v_row; /*r3*/ +@ int y_yweight; /*r4*/ +@ int y_pitch; /*r5*/ +@ int width; /*r6*/ +@ int source_x0_q16; /*r7*/ +@ int source_dx_q16; /*r8*/ +@ int source_uv_xoffs_q16; /*r9*/ +@ }; + .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON + .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function + .balign 64 + .fnstart +ScaleYCbCr42xToRGB565_BilinearY_Row_NEON: + STMFD r13!,{r4-r9,r14} @ 7 words. + ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON + VPUSH {Q4-Q7} @ 16 words. + ADD r14,r14,r1, LSL #4 @ Select the dither table to use + LDMIA r0, {r0-r9} + @ Set up image index registers. 
+ ADD r12,r8, r8 + VMOV.I32 D16,#0 @ Q8 = < 2| 2| 0| 0>*source_dx_q16 + VDUP.32 D17,r12 + ADD r12,r12,r12 + VTRN.32 D16,D17 @ Q8 = < 2| 0| 2| 0>*source_dx_q16 + VDUP.32 D19,r12 @ Q9 = < 4| 4| ?| ?>*source_dx_q16 + ADD r12,r12,r12 + VDUP.32 Q0, r7 @ Q0 = < 1| 1| 1| 1>*source_x0_q16 + VADD.I32 D17,D17,D19 @ Q8 = < 6| 4| 2| 0>*source_dx_q16 + CMP r8, #0 @ If source_dx_q16 is negative... + VDUP.32 Q9, r12 @ Q9 = < 8| 8| 8| 8>*source_dx_q16 + ADDLT r7, r7, r8, LSL #4 @ Make r7 point to the end of the block + VADD.I32 Q0, Q0, Q8 @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16 + SUBLT r7, r7, r8 @ (i.e., the lowest address we'll use) + VADD.I32 Q1, Q0, Q9 @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16 + VDUP.I32 Q9, r8 @ Q9 = < 1| 1| 1| 1>*source_dx_q16 + VADD.I32 Q2, Q0, Q9 @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16 + VADD.I32 Q3, Q1, Q9 @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16 + VLD1.64 {D30,D31},[r14,:128] @ Load some constants + VMOV.I8 D28,#52 + VMOV.I8 D29,#129 + @ The basic idea here is to do aligned loads of a block of data and then + @ index into it using VTBL to extract the data from the source X + @ coordinate corresponding to each destination pixel. + @ This is significantly less code and significantly fewer cycles than doing + @ a series of single-lane loads, but it means that the X step between + @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee + @ that we could read 8 pixels from a single aligned 32-byte block of data. + @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel, + @ separated into even pixels and odd pixels to make extracting offsets and + @ weights easier. + @ We then pull out two bytes from the middle of each coordinate: the top + @ byte corresponds to the integer part of the X coordinate, and the bottom + @ byte corresponds to the weight to use for bilinear blending. + @ These are separated out into different registers with VTRN. 
+ @ Then by subtracting the integer X coordinate of the first pixel in the + @ data block we loaded, we produce an index register suitable for use by + @ VTBL. +s42xbily_neon_loop: + @ Load the Y' data. + MOV r12,r7, ASR #16 + VRSHRN.S32 D16,Q0, #8 + AND r12,r12,#~15 @ Read 16-byte aligned blocks + VDUP.I8 D20,r12 + ADD r12,r1, r12 @ r12 = y_row+(source_x&~15) + VRSHRN.S32 D17,Q1, #8 + PLD [r12,#64] + VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row + ADD r14,r7, r8, LSL #3 + VRSHRN.S32 D18,Q2, #8 + MOV r14,r14,ASR #16 + VRSHRN.S32 D19,Q3, #8 + AND r14,r14,#~15 @ Read 16-byte aligned blocks + VLD1.64 {D12,D13,D14,D15},[r12,:128] @ Load Y' bottom row + PLD [r12,#64] + VDUP.I8 D21,r14 + ADD r14,r1, r14 @ r14 = y_row+(source_x&~15) + VMOV.I8 Q13,#1 + PLD [r14,#64] + VTRN.8 Q8, Q9 @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0> + @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0> + VSUB.S8 Q9, Q9, Q10 @ Make offsets relative to the data we loaded. + @ First 8 Y' pixels + VTBL.8 D20,{D8, D9, D10,D11},D18 @ Index top row at source_x + VTBL.8 D24,{D12,D13,D14,D15},D18 @ Index bottom row at source_x + VADD.S8 Q13,Q9, Q13 @ Add 1 to source_x + VTBL.8 D22,{D8, D9, D10,D11},D26 @ Index top row at source_x+1 + VTBL.8 D26,{D12,D13,D14,D15},D26 @ Index bottom row at source_x+1 + @ Next 8 Y' pixels + VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row + VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Y' bottom row + PLD [r14,#64] + VTBL.8 D21,{D8, D9, D10,D11},D19 @ Index top row at source_x + VTBL.8 D25,{D12,D13,D14,D15},D19 @ Index bottom row at source_x + VTBL.8 D23,{D8, D9, D10,D11},D27 @ Index top row at source_x+1 + VTBL.8 D27,{D12,D13,D14,D15},D27 @ Index bottom row at source_x+1 + @ Blend Y'. + VDUP.I16 Q9, r4 @ Load the y weights. 
+ VSUBL.U8 Q4, D24,D20 @ Q5:Q4 = c-a + VSUBL.U8 Q5, D25,D21 + VSUBL.U8 Q6, D26,D22 @ Q7:Q6 = d-b + VSUBL.U8 Q7, D27,D23 + VMUL.S16 Q4, Q4, Q9 @ Q5:Q4 = (c-a)*yweight + VMUL.S16 Q5, Q5, Q9 + VMUL.S16 Q6, Q6, Q9 @ Q7:Q6 = (d-b)*yweight + VMUL.S16 Q7, Q7, Q9 + VMOVL.U8 Q12,D16 @ Promote the x weights to 16 bits. + VMOVL.U8 Q13,D17 @ Sadly, there's no VMULW. + VRSHRN.S16 D8, Q4, #8 @ Q4 = (c-a)*yweight+128>>8 + VRSHRN.S16 D9, Q5, #8 + VRSHRN.S16 D12,Q6, #8 @ Q6 = (d-b)*yweight+128>>8 + VRSHRN.S16 D13,Q7, #8 + VADD.I8 Q10,Q10,Q4 @ Q10 = a+((c-a)*yweight+128>>8) + VADD.I8 Q11,Q11,Q6 @ Q11 = b+((d-b)*yweight+128>>8) + VSUBL.U8 Q4, D22,D20 @ Q5:Q4 = b-a + VSUBL.U8 Q5, D23,D21 + VMUL.S16 Q4, Q4, Q12 @ Q5:Q4 = (b-a)*xweight + VMUL.S16 Q5, Q5, Q13 + VRSHRN.S16 D8, Q4, #8 @ Q4 = (b-a)*xweight+128>>8 + ADD r12,r7, r9 + VRSHRN.S16 D9, Q5, #8 + MOV r12,r12,ASR #17 + VADD.I8 Q8, Q10,Q4 @ Q8 = a+((b-a)*xweight+128>>8) + @ Start extracting the chroma x coordinates, and load Cb and Cr. + AND r12,r12,#~15 @ Read 16-byte aligned blocks + VDUP.I32 Q9, r9 @ Q9 = source_uv_xoffs_q16 x 4 + ADD r14,r2, r12 + VADD.I32 Q10,Q0, Q9 + VLD1.64 {D8, D9, D10,D11},[r14,:128] @ Load Cb + PLD [r14,#64] + VADD.I32 Q11,Q1, Q9 + ADD r14,r3, r12 + VADD.I32 Q12,Q2, Q9 + VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Cr + PLD [r14,#64] + VADD.I32 Q13,Q3, Q9 + VRSHRN.S32 D20,Q10,#9 @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0> + VRSHRN.S32 D21,Q11,#9 + VDUP.I8 Q9, r12 + VRSHRN.S32 D22,Q12,#9 @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1> + VRSHRN.S32 D23,Q13,#9 + @ We don't actually need the x weights, but we get them for free. + @ Free ALU slot + VTRN.8 Q10,Q11 @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0> + @ Free ALU slot @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0> + VSUB.S8 Q11,Q11,Q9 @ Make offsets relative to the data we loaded. 
+ VTBL.8 D18,{D8, D9, D10,D11},D22 @ Index Cb at source_x + VMOV.I8 D24,#74 + VTBL.8 D19,{D8, D9, D10,D11},D23 + VMOV.I8 D26,#102 + VTBL.8 D20,{D12,D13,D14,D15},D22 @ Index Cr at source_x + VMOV.I8 D27,#25 + VTBL.8 D21,{D12,D13,D14,D15},D23 + @ We now have Y' in Q8, Cb in Q9, and Cr in Q10 + @ We use VDUP to expand constants, because it's a permute instruction, so + @ it can dual issue on the A8. + SUBS r6, r6, #16 @ width -= 16 + VMULL.U8 Q4, D16,D24 @ Q5:Q4 = Y'*74 + VDUP.32 Q6, D30[1] @ Q7:Q6 = bias_G + VMULL.U8 Q5, D17,D24 + VDUP.32 Q7, D30[1] + VMLSL.U8 Q6, D18,D27 @ Q7:Q6 = -25*Cb+bias_G + VDUP.32 Q11,D30[0] @ Q12:Q11 = bias_R + VMLSL.U8 Q7, D19,D27 + VDUP.32 Q12,D30[0] + VMLAL.U8 Q11,D20,D26 @ Q12:Q11 = 102*Cr+bias_R + VDUP.32 Q8, D31[0] @ Q13:Q8 = bias_B + VMLAL.U8 Q12,D21,D26 + VDUP.32 Q13,D31[0] + VMLAL.U8 Q8, D18,D29 @ Q13:Q8 = 129*Cb+bias_B + VMLAL.U8 Q13,D19,D29 + VMLSL.U8 Q6, D20,D28 @ Q7:Q6 = -25*Cb-52*Cr+bias_G + VMLSL.U8 Q7, D21,D28 + VADD.S16 Q11,Q4, Q11 @ Q12:Q11 = 74*Y'+102*Cr+bias_R + VADD.S16 Q12,Q5, Q12 + VQADD.S16 Q8, Q4, Q8 @ Q13:Q8 = 74*Y'+129*Cb+bias_B + VQADD.S16 Q13,Q5, Q13 + VADD.S16 Q6, Q4, Q6 @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G + VADD.S16 Q7, Q5, Q7 + @ Push each value to the top of its word and saturate it. + VQSHLU.S16 Q11,Q11,#2 + VQSHLU.S16 Q12,Q12,#2 + VQSHLU.S16 Q6, Q6, #2 + VQSHLU.S16 Q7, Q7, #2 + VQSHLU.S16 Q8, Q8, #2 + VQSHLU.S16 Q13,Q13,#2 + @ Merge G and B into R. + VSRI.U16 Q11,Q6, #5 + VSRI.U16 Q12,Q7, #5 + VSRI.U16 Q11,Q8, #11 + MOV r14,r8, LSL #4 + VSRI.U16 Q12,Q13,#11 + BLT s42xbily_neon_tail + VDUP.I32 Q13,r14 + @ Store the result. + VST1.16 {D22,D23,D24,D25},[r0]! + BEQ s42xbily_neon_done + @ Advance the x coordinates. + VADD.I32 Q0, Q0, Q13 + VADD.I32 Q1, Q1, Q13 + ADD r7, r14 + VADD.I32 Q2, Q2, Q13 + VADD.I32 Q3, Q3, Q13 + B s42xbily_neon_loop +s42xbily_neon_tail: + @ We have between 1 and 15 pixels left to write. + @ -r6 == the number of pixels we need to skip writing. 
+ @ Adjust r0 to point to the last one we need to write, because we're going + @ to write them in reverse order. + ADD r0, r0, r6, LSL #1 + MOV r14,#-2 + ADD r0, r0, #30 + @ Skip past the ones we don't need to write. + SUB PC, PC, r6, LSL #2 + ORR r0, r0, r0 + VST1.16 {D25[3]},[r0,:16],r14 + VST1.16 {D25[2]},[r0,:16],r14 + VST1.16 {D25[1]},[r0,:16],r14 + VST1.16 {D25[0]},[r0,:16],r14 + VST1.16 {D24[3]},[r0,:16],r14 + VST1.16 {D24[2]},[r0,:16],r14 + VST1.16 {D24[1]},[r0,:16],r14 + VST1.16 {D24[0]},[r0,:16],r14 + VST1.16 {D23[3]},[r0,:16],r14 + VST1.16 {D23[2]},[r0,:16],r14 + VST1.16 {D23[1]},[r0,:16],r14 + VST1.16 {D23[0]},[r0,:16],r14 + VST1.16 {D22[3]},[r0,:16],r14 + VST1.16 {D22[2]},[r0,:16],r14 + VST1.16 {D22[1]},[r0,:16],r14 + VST1.16 {D22[0]},[r0,:16] +s42xbily_neon_done: + VPOP {Q4-Q7} @ 16 words. + LDMFD r13!,{r4-r9,PC} @ 7 words. + .fnend + .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON