Bug 634557 - Implement ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, r=jbramley
author Timothy B. Terriberry <tterribe@vt.edu>
Thu, 14 Apr 2011 17:34:18 -0700
changeset 68192 e05cdd49d004d542a232edd51e13e5a9e68668d8
parent 68191 e957f873a56524345a2e5f7251735f949ee5b7e5
child 68193 b92ca278fe217c169c2025ae241a07bd023689b7
push id 19535
push user Ms2ger@gmail.com
push date Sat, 16 Apr 2011 09:25:05 +0000
reviewers jbramley
bugs 634557
milestone 6.0a1
gfx/ycbcr/Makefile.in
gfx/ycbcr/ycbcr_to_rgb565.cpp
gfx/ycbcr/yuv_row_arm.s
--- a/gfx/ycbcr/Makefile.in
+++ b/gfx/ycbcr/Makefile.in
@@ -71,18 +71,22 @@ else
 CPPSRCS += yuv_row_other.cpp \
            $(NULL)
 endif # Darwin
 endif # SunOS
 endif # linux
 endif # windows
 
 ifeq (arm,$(findstring arm,$(OS_TEST)))
+ifdef HAVE_ARM_NEON
 CPPSRCS += yuv_convert_arm.cpp \
            $(NULL)
+ASFILES = yuv_row_arm.$(ASM_SUFFIX) \
+          $(NULL)
+endif
 endif
 
 EXTRA_DSO_LDOPTS += \
         $(LIBS_DIR) \
         $(EXTRA_DSO_LIBS) \
         $(XPCOM_LIBS) \
         $(NSPR_LIBS) \
         $(NULL)
--- a/gfx/ycbcr/ycbcr_to_rgb565.cpp
+++ b/gfx/ycbcr/ycbcr_to_rgb565.cpp
@@ -96,16 +96,19 @@ typedef void (*yuv2rgb565_row_scale_bili
 
 typedef void (*yuv2rgb565_row_scale_nearest_func)(
  const yuv2rgb565_row_scale_nearest_ctx *ctx, int dither);
 
 
 
 # if defined(MOZILLA_MAY_SUPPORT_NEON)
 
+extern "C" void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+ const yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+
 void __attribute((noinline)) yuv42x_to_rgb565_row_neon(uint16 *dst,
                                                        const uint8 *y,
                                                        const uint8 *u,
                                                        const uint8 *v,
                                                        int n,
                                                        int oddflag);
 
 #endif
@@ -362,16 +365,25 @@ NS_GFX_(void) ScaleYCbCrToRGB565(const P
      padded reference frames).
     In practice, we do not even _have_ the actual bounds of the source, as
      we are passed a crop rectangle from it, and not the dimensions of the full
      image.
     This assertion will not guarantee our out-of-bounds reads are safe, but it
      should at least catch the simple case of passing in an unpadded buffer.*/
   NS_ASSERTION(abs(y_pitch) >= abs(source_width)+16,
     "ScaleYCbCrToRGB565 source image unpadded?");
+  /*The NEON code requires the pointers to be aligned to a 16-byte boundary at
+     the start of each row.
+    This should be true for all of our sources.
+    We could try to fix this up if it's not true by adjusting source_x0, but
+     that would require the mis-alignment to be the same for the U and V
+     planes.*/
+  NS_ASSERTION((y_pitch&15) == 0 && (uv_pitch&15) == 0 &&
+   ((y_buf-NULL)&15) == 0 && ((u_buf-NULL)&15) == 0 && ((v_buf-NULL)&15) == 0,
+   "ScaleYCbCrToRGB565 source image unaligned");
   /*We take an area-based approach to pixel coverage to avoid shifting by small
      amounts (or not so small, when up-scaling or down-scaling by a large
      factor).
 
     An illustrative example: scaling 4:2:0 up by 2, using JPEG chroma cositing^.
 
     + = RGB destination locations
     * = Y' source locations
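The scaler tracks source positions in 16.16 fixed point. A minimal C sketch of that arithmetic, which the row selection below and the NEON row both build on (source_dx_q16 and source_x0_q16 are the names the code itself uses; the loop framing is illustrative):

    int source_dx_q16 = (source_width << 16) / width;  /* step per pixel */
    int x_q16 = source_x0_q16;            /* coordinate of output pixel 0 */
    for (int i = 0; i < width; i++) {
      int xi = x_q16 >> 16;               /* integer source column */
      int xw = (x_q16 >> 8) & 0xFF;       /* 8-bit bilinear blend weight */
      /* dst[i] blends src[xi] and src[xi + 1] with weight xw/256. */
      x_q16 += source_dx_q16;
    }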
@@ -488,18 +500,23 @@ NS_GFX_(void) ScaleYCbCrToRGB565(const P
      CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
     if (uvxscale_min <= abs(source_dx_q16)
      && abs(source_dx_q16) <= uvxscale_max
      && uvyscale_min <= abs(source_dy_q16)
      && abs(source_dy_q16) <= uvyscale_max) {
       /*Add the rounding offsets now.*/
       source_uv_xoffs_q16 += 1<<(15+x_shift);
       source_uv_yoffs_q16 += 1<<(15+y_shift);
-      if (yuv_type != YV24)
-        scale_row = ScaleYCbCr42xToRGB565_BilinearY_Row_C;
+      if (yuv_type != YV24) {
+        scale_row =
+#  if defined(MOZILLA_MAY_SUPPORT_NEON)
+         supports_neon() ? ScaleYCbCr42xToRGB565_BilinearY_Row_NEON :
+#  endif
+         ScaleYCbCr42xToRGB565_BilinearY_Row_C;
+      }
       else
         scale_row = ScaleYCbCr444ToRGB565_BilinearY_Row_C;
     }
     else {
       if (yuv_type == YV12)
         scale_row = ScaleYCbCr420ToRGB565_Bilinear_Row_C;
       else if (yuv_type == YV16)
         scale_row = ScaleYCbCr422ToRGB565_Bilinear_Row_C;
@@ -554,16 +571,46 @@ NS_GFX_(bool) IsScaleYCbCrToRGB565Fast(i
                                        int source_y0,
                                        int source_width,
                                        int source_height,
                                        int width,
                                        int height,
                                        YUVType yuv_type,
                                        ScaleFilter filter)
 {
+  // Very fast.
+  if (width <= 0 || height <= 0)
+    return true;
+#  if defined(MOZILLA_MAY_SUPPORT_NEON)
+  if (filter != FILTER_NONE) {
+    int source_dx_q16;
+    int source_dy_q16;
+    int uvxscale_min;
+    int uvxscale_max;
+    int uvyscale_min;
+    int uvyscale_max;
+    source_dx_q16 = (source_width<<16) / width;
+    source_dy_q16 = (source_height<<16) / height;
+    uvxscale_min = yuv_type != YV24 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+    uvxscale_max = yuv_type != YV24 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+    uvyscale_min = yuv_type == YV12 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MIN : CHROMA_NEAREST_NORMAL_STEP_MIN;
+    uvyscale_max = yuv_type == YV12 ?
+     CHROMA_NEAREST_SUBSAMP_STEP_MAX : CHROMA_NEAREST_NORMAL_STEP_MAX;
+    if (uvxscale_min <= abs(source_dx_q16)
+     && abs(source_dx_q16) <= uvxscale_max
+     && uvyscale_min <= abs(source_dy_q16)
+     && abs(source_dy_q16) <= uvyscale_max) {
+      if (yuv_type != YV24)
+        return supports_neon();
+    }
+  }
+#  endif
   return false;
 }
 
 
 
 void yuv_to_rgb565_row_c(uint16 *dst,
                          const uint8 *y,
                          const uint8 *u,
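IsScaleYCbCrToRGB565Fast mirrors the selection logic above: it answers true only when the requested scale factors fall in the range the NEON row can service. A sketch of the intended use; the caller context is hypothetical, and the truncated first parameter is assumed to be source_x0 by analogy with the others:

    // Hypothetical caller: take the 16 bpp path only when a fast row
    // implementation exists for this scale factor and filter.
    if (IsScaleYCbCrToRGB565Fast(source_x0, source_y0,
                                 source_width, source_height,
                                 width, height,
                                 YV12, FILTER_BILINEAR)) {
      // Call ScaleYCbCrToRGB565 with the same geometry.
    } else {
      // Fall back to a 32 bpp conversion path.
    }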
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_arm.s
@@ -0,0 +1,329 @@
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is NEON YUV-to-RGB565 scaling code.
+ *
+ * The Initial Developer of the Original Code is the Mozilla Foundation.
+ * Portions created by the Initial Developer are Copyright (C) 2011
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Timothy B. Terriberry <tterriberry@mozilla.com>
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+    .arch   armv7-a
+    .fpu    neon
+    .text
+    .align
+
+    .balign 64
+YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
+    .short -14240
+    .short -14240+384
+    .short   8672
+    .short   8672+192
+    .short -17696
+    .short -17696+384
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
+    .short -14240+128
+    .short -14240+256
+    .short   8672+64
+    .short   8672+128
+    .short -17696+128
+    .short -17696+256
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
+    .short -14240+256
+    .short -14240+128
+    .short   8672+128
+    .short   8672+64
+    .short -17696+256
+    .short -17696+128
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
+    .short -14240+384
+    .short -14240
+    .short   8672+192
+    .short   8672
+    .short -17696+384
+    .short -17696
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+
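+@ The .short values above are per-channel biases with the 2x2 ordered-dither
+@  offsets pre-added, one table per dither pattern; the .byte values are the
+@  Q6 BT.601 coefficients used below (102*Cr for R, 25*Cb and 52*Cr for G,
+@  129*Cb for B, 74*Y' common to all three).
+@ The biases fold in the +16 luma and +128 chroma offsets:
+@   bias_R = -(74*16 + 102*128)    = -14240
+@   bias_G =  -74*16 + (25+52)*128 =   8672
+@   bias_B = -(74*16 + 129*128)    = -17696
+@ The dither offsets step in quarters of one output quantization step:
+@  512 in Q6 for the 5-bit R and B channels, 256 for the 6-bit G channel.
+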
+@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+@  yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+@
+@ ctx = {
+@   PRUint16 *rgb_row;       /*r0*/
+@   const PRUint8 *y_row;    /*r1*/
+@   const PRUint8 *u_row;    /*r2*/
+@   const PRUint8 *v_row;    /*r3*/
+@   int y_yweight;           /*r4*/
+@   int y_pitch;             /*r5*/
+@   int width;               /*r6*/
+@   int source_x0_q16;       /*r7*/
+@   int source_dx_q16;       /*r8*/
+@   int source_uv_xoffs_q16; /*r9*/
+@ };
+    .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+    .type   ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
+    .balign 64
+    .fnstart
+ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
+    STMFD       r13!,{r4-r9,r14}       @ 8 words.
+    ADR         r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
+    VPUSH       {Q4-Q7}                @ 16 words.
+    ADD         r14,r14,r1, LSL #4     @ Select the dither table to use
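+    @ Load all ten ctx fields into r0-r9 in one go; this replaces the ctx
+    @  pointer in r0 with the first field, ctx->rgb_row.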
+    LDMIA       r0, {r0-r9}
+    @ Set up image index registers.
+    ADD         r12,r8, r8
+    VMOV.I32    D16,#0         @ Q8 = < 2| 2| 0| 0>*source_dx_q16
+    VDUP.32     D17,r12
+    ADD         r12,r12,r12
+    VTRN.32     D16,D17        @ Q8 = < 2| 0| 2| 0>*source_dx_q16
+    VDUP.32     D19,r12        @ Q9 = < 4| 4| ?| ?>*source_dx_q16
+    ADD         r12,r12,r12
+    VDUP.32     Q0, r7         @ Q0 = < 1| 1| 1| 1>*source_x0_q16
+    VADD.I32    D17,D17,D19    @ Q8 = < 6| 4| 2| 0>*source_dx_q16
+    CMP         r8, #0                 @ If source_dx_q16 is negative...
+    VDUP.32     Q9, r12        @ Q9 = < 8| 8| 8| 8>*source_dx_q16
+    ADDLT       r7, r7, r8, LSL #4     @ Make r7 point to the end of the block
+    VADD.I32    Q0, Q0, Q8     @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
+    SUBLT       r7, r7, r8             @ (i.e., the lowest address we'll use)
+    VADD.I32    Q1, Q0, Q9     @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
+    VDUP.I32    Q9, r8         @ Q9 = < 1| 1| 1| 1>*source_dx_q16
+    VADD.I32    Q2, Q0, Q9     @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
+    VADD.I32    Q3, Q1, Q9     @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
+    VLD1.64     {D30,D31},[r14,:128]   @ Load some constants
+    VMOV.I8     D28,#52
+    VMOV.I8     D29,#129
+    @ The basic idea here is to do aligned loads of a block of data and then
+    @  index into it using VTBL to extract the data from the source X
+    @  coordinate corresponding to each destination pixel.
+    @ This is significantly less code and significantly fewer cycles than doing
+    @  a series of single-lane loads, but it means that the X step between
+    @  pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
+    @  that we could read 8 pixels from a single aligned 32-byte block of data.
+    @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
+    @  separated into even pixels and odd pixels to make extracting offsets and
+    @  weights easier.
+    @ We then pull out two bytes from the middle of each coordinate: the top
+    @  byte corresponds to the integer part of the X coordinate, and the bottom
+    @  byte corresponds to the weight to use for bilinear blending.
+    @ These are separated out into different registers with VTRN.
+    @ Then by subtracting the integer X coordinate of the first pixel in the
+    @  data block we loaded, we produce an index register suitable for use by
+    @  VTBL.
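+    @ In C terms, per pixel (names illustrative):
+    @   x_mid  = (int16_t)((x_q16 + 128) >> 8);     /* the VRSHRN.S32 #8 */
+    @   weight = x_mid & 0xFF;                      /* top 8 fraction bits */
+    @   index  = ((x_mid >> 8) - block_x) & 0xFF;   /* after VTRN and VSUB */
+    @  where block_x is the low byte of (x_q16 >> 16) & ~15, the aligned
+    @  column the VLD1s start at, duplicated into D20/D21.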
+s42xbily_neon_loop:
+    @ Load the Y' data.
+    MOV         r12,r7, ASR #16
+    VRSHRN.S32  D16,Q0, #8
+    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
+    VDUP.I8     D20,r12
+    ADD         r12,r1, r12    @ r12 = y_row+(source_x&~15)
+    VRSHRN.S32  D17,Q1, #8
+    PLD         [r12,#64]
+    VLD1.64     {D8, D9, D10,D11},[r12,:128],r5        @ Load Y' top row
+    ADD         r14,r7, r8, LSL #3
+    VRSHRN.S32  D18,Q2, #8
+    MOV         r14,r14,ASR #16
+    VRSHRN.S32  D19,Q3, #8
+    AND         r14,r14,#~15   @ Read 16-byte aligned blocks
+    VLD1.64     {D12,D13,D14,D15},[r12,:128]           @ Load Y' bottom row
+    PLD         [r12,#64]
+    VDUP.I8     D21,r14
+    ADD         r14,r1, r14    @ r14 = y_row+(source_x&~15)
+    VMOV.I8     Q13,#1
+    PLD         [r14,#64]
+    VTRN.8      Q8, Q9         @ Q8  = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+                               @ Q9  = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+    VSUB.S8     Q9, Q9, Q10    @ Make offsets relative to the data we loaded.
+    @ First 8 Y' pixels
+    VTBL.8      D20,{D8, D9, D10,D11},D18      @ Index top row at source_x
+    VTBL.8      D24,{D12,D13,D14,D15},D18      @ Index bottom row at source_x
+    VADD.S8     Q13,Q9, Q13                    @ Add 1 to source_x
+    VTBL.8      D22,{D8, D9, D10,D11},D26      @ Index top row at source_x+1
+    VTBL.8      D26,{D12,D13,D14,D15},D26      @ Index bottom row at source_x+1
+    @ Next 8 Y' pixels
+    VLD1.64     {D8, D9, D10,D11},[r14,:128],r5        @ Load Y' top row
+    VLD1.64     {D12,D13,D14,D15},[r14,:128]           @ Load Y' bottom row
+    PLD         [r14,#64]
+    VTBL.8      D21,{D8, D9, D10,D11},D19      @ Index top row at source_x
+    VTBL.8      D25,{D12,D13,D14,D15},D19      @ Index bottom row at source_x
+    VTBL.8      D23,{D8, D9, D10,D11},D27      @ Index top row at source_x+1
+    VTBL.8      D27,{D12,D13,D14,D15},D27      @ Index bottom row at source_x+1
+    @ Blend Y'.
+    VDUP.I16    Q9, r4         @ Load the y weights.
+    VSUBL.U8    Q4, D24,D20    @ Q5:Q4 = c-a
+    VSUBL.U8    Q5, D25,D21
+    VSUBL.U8    Q6, D26,D22    @ Q7:Q6 = d-b
+    VSUBL.U8    Q7, D27,D23
+    VMUL.S16    Q4, Q4, Q9     @ Q5:Q4 = (c-a)*yweight
+    VMUL.S16    Q5, Q5, Q9
+    VMUL.S16    Q6, Q6, Q9     @ Q7:Q6 = (d-b)*yweight
+    VMUL.S16    Q7, Q7, Q9
+    VMOVL.U8    Q12,D16        @ Promote the x weights to 16 bits.
+    VMOVL.U8    Q13,D17        @ Sadly, there's no VMULW.
+    VRSHRN.S16  D8, Q4, #8     @ Q4 = (c-a)*yweight+128>>8
+    VRSHRN.S16  D9, Q5, #8
+    VRSHRN.S16  D12,Q6, #8     @ Q6 = (d-b)*yweight+128>>8
+    VRSHRN.S16  D13,Q7, #8
+    VADD.I8     Q10,Q10,Q4     @ Q10 = a+((c-a)*yweight+128>>8)
+    VADD.I8     Q11,Q11,Q6     @ Q11 = b+((d-b)*yweight+128>>8)
+    VSUBL.U8    Q4, D22,D20    @ Q5:Q4 = b-a
+    VSUBL.U8    Q5, D23,D21
+    VMUL.S16    Q4, Q4, Q12    @ Q5:Q4 = (b-a)*xweight
+    VMUL.S16    Q5, Q5, Q13
+    VRSHRN.S16  D8, Q4, #8     @ Q4 = (b-a)*xweight+128>>8
+    ADD         r12,r7, r9
+    VRSHRN.S16  D9, Q5, #8
+    MOV         r12,r12,ASR #17
+    VADD.I8     Q8, Q10,Q4     @ Q8 = a+((b-a)*xweight+128>>8)
+    @ Start extracting the chroma x coordinates, and load Cb and Cr.
+    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
+    VDUP.I32    Q9, r9         @ Q9 = source_uv_xoffs_q16 x 4
+    ADD         r14,r2, r12
+    VADD.I32    Q10,Q0, Q9
+    VLD1.64     {D8, D9, D10,D11},[r14,:128]   @ Load Cb
+    PLD         [r14,#64]
+    VADD.I32    Q11,Q1, Q9
+    ADD         r14,r3, r12
+    VADD.I32    Q12,Q2, Q9
+    VLD1.64     {D12,D13,D14,D15},[r14,:128]   @ Load Cr
+    PLD         [r14,#64]
+    VADD.I32    Q13,Q3, Q9
+    VRSHRN.S32  D20,Q10,#9     @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
+    VRSHRN.S32  D21,Q11,#9
+    VDUP.I8     Q9, r12
+    VRSHRN.S32  D22,Q12,#9     @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
+    VRSHRN.S32  D23,Q13,#9
+    @ We don't actually need the x weights, but we get them for free.
+    @ Free ALU slot
+    VTRN.8      Q10,Q11        @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+    @ Free ALU slot            @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+    VSUB.S8     Q11,Q11,Q9     @ Make offsets relative to the data we loaded.
+    VTBL.8      D18,{D8, D9, D10,D11},D22      @ Index Cb at source_x
+    VMOV.I8     D24,#74
+    VTBL.8      D19,{D8, D9, D10,D11},D23
+    VMOV.I8     D26,#102
+    VTBL.8      D20,{D12,D13,D14,D15},D22      @ Index Cr at source_x
+    VMOV.I8     D27,#25
+    VTBL.8      D21,{D12,D13,D14,D15},D23
+    @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
+    @ We use VDUP to expand constants, because it's a permute instruction, so
+    @  it can dual issue on the A8.
+    SUBS        r6, r6, #16    @ width -= 16
+    VMULL.U8    Q4, D16,D24    @  Q5:Q4  = Y'*74
+    VDUP.32     Q6, D30[1]     @  Q7:Q6  = bias_G
+    VMULL.U8    Q5, D17,D24
+    VDUP.32     Q7, D30[1]
+    VMLSL.U8    Q6, D18,D27    @  Q7:Q6  = -25*Cb+bias_G
+    VDUP.32     Q11,D30[0]     @ Q12:Q11 = bias_R
+    VMLSL.U8    Q7, D19,D27
+    VDUP.32     Q12,D30[0]
+    VMLAL.U8    Q11,D20,D26    @ Q12:Q11 = 102*Cr+bias_R
+    VDUP.32     Q8, D31[0]     @ Q13:Q8  = bias_B
+    VMLAL.U8    Q12,D21,D26
+    VDUP.32     Q13,D31[0]
+    VMLAL.U8    Q8, D18,D29    @ Q13:Q8  = 129*Cb+bias_B
+    VMLAL.U8    Q13,D19,D29
+    VMLSL.U8    Q6, D20,D28    @  Q7:Q6  = -25*Cb-52*Cr+bias_G
+    VMLSL.U8    Q7, D21,D28
+    VADD.S16    Q11,Q4, Q11    @ Q12:Q11 = 74*Y'+102*Cr+bias_R
+    VADD.S16    Q12,Q5, Q12
+    VQADD.S16   Q8, Q4, Q8     @ Q13:Q8  = 74*Y'+129*Cb+bias_B
+    VQADD.S16   Q13,Q5, Q13
+    VADD.S16    Q6, Q4, Q6     @  Q7:Q6  = 74*Y'-25*Cb-52*Cr+bias_G
+    VADD.S16    Q7, Q5, Q7
+    @ Push each value to the top of its word and saturate it.
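+    @ Each channel here is 64x its 8-bit value (Q6); the unsigned saturating
+    @  shift left by 2 clamps out-of-gamut results and leaves the channel in
+    @  the top byte of each word, so the VSRIs below can pack
+    @  <rrrrrggggggbbbbb> without any extra masking.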
+    VQSHLU.S16 Q11,Q11,#2
+    VQSHLU.S16 Q12,Q12,#2
+    VQSHLU.S16 Q6, Q6, #2
+    VQSHLU.S16 Q7, Q7, #2
+    VQSHLU.S16 Q8, Q8, #2
+    VQSHLU.S16 Q13,Q13,#2
+    @ Merge G and B into R.
+    VSRI.U16   Q11,Q6, #5
+    VSRI.U16   Q12,Q7, #5
+    VSRI.U16   Q11,Q8, #11
+    MOV         r14,r8, LSL #4
+    VSRI.U16   Q12,Q13,#11
+    BLT s42xbily_neon_tail
+    VDUP.I32    Q13,r14
+    @ Store the result.
+    VST1.16     {D22,D23,D24,D25},[r0]!
+    BEQ s42xbily_neon_done
+    @ Advance the x coordinates.
+    VADD.I32    Q0, Q0, Q13
+    VADD.I32    Q1, Q1, Q13
+    ADD         r7, r14
+    VADD.I32    Q2, Q2, Q13
+    VADD.I32    Q3, Q3, Q13
+    B s42xbily_neon_loop
+s42xbily_neon_tail:
+    @ We have between 1 and 15 pixels left to write.
+    @ -r6 == the number of pixels we need to skip writing.
+    @ Adjust r0 to point to the last one we need to write, because we're going
+    @  to write them in reverse order.
+    ADD         r0, r0, r6, LSL #1
+    MOV         r14,#-2
+    ADD         r0, r0, #30
+    @ Skip past the ones we don't need to write.
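+    @ In ARM state PC reads as the address of the current instruction plus
+    @  8, so this computed branch skips the first -r6 of the 16 stores
+    @  below; the ORR is a 4-byte no-op pad so that a skip count of zero
+    @  would land on the first VST1.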
+    SUB         PC, PC, r6, LSL #2
+    ORR         r0, r0, r0
+    VST1.16     {D25[3]},[r0,:16],r14
+    VST1.16     {D25[2]},[r0,:16],r14
+    VST1.16     {D25[1]},[r0,:16],r14
+    VST1.16     {D25[0]},[r0,:16],r14
+    VST1.16     {D24[3]},[r0,:16],r14
+    VST1.16     {D24[2]},[r0,:16],r14
+    VST1.16     {D24[1]},[r0,:16],r14
+    VST1.16     {D24[0]},[r0,:16],r14
+    VST1.16     {D23[3]},[r0,:16],r14
+    VST1.16     {D23[2]},[r0,:16],r14
+    VST1.16     {D23[1]},[r0,:16],r14
+    VST1.16     {D23[0]},[r0,:16],r14
+    VST1.16     {D22[3]},[r0,:16],r14
+    VST1.16     {D22[2]},[r0,:16],r14
+    VST1.16     {D22[1]},[r0,:16],r14
+    VST1.16     {D22[0]},[r0,:16]
+s42xbily_neon_done:
+    VPOP        {Q4-Q7}                @ 16 words.
+    LDMFD       r13!,{r4-r9,PC}        @ 8 words.
+    .fnend
+    .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
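
For reference, a scalar C model of what the NEON row computes for output pixel i: bilinear Y', nearest-neighbor chroma, the Q6 constants above, and dither folded into the biases. The names follow the ctx fields; this is an illustrative sketch, not the actual C fallback:

    int x_q16 = source_x0_q16 + i*source_dx_q16;
    int xi = x_q16 >> 16;                      /* integer source column */
    int xw = (x_q16 >> 8) & 0xFF;              /* 8-bit x weight */
    int a = y_row[xi],           b = y_row[xi + 1];
    int c = y_row[xi + y_pitch], d = y_row[xi + y_pitch + 1];
    int v0 = a + (((c - a)*y_yweight + 128) >> 8);  /* blend rows, column xi */
    int v1 = b + (((d - b)*y_yweight + 128) >> 8);  /* ... and column xi+1 */
    int yy = v0 + (((v1 - v0)*xw + 128) >> 8);      /* blend columns */
    int ci = (x_q16 + source_uv_xoffs_q16) >> 17;   /* nearest chroma column */
    int cb = u_row[ci], cr = v_row[ci];
    int R = 74*yy + 102*cr          + bias_R;       /* Q6, dither included */
    int G = 74*yy -  25*cb - 52*cr  + bias_G;
    int B = 74*yy + 129*cb          + bias_B;
    /* rgb_row[i] = saturate each to [0,0xFFFF] after <<2, then merge the
       top 5/6/5 bits as the VSRIs do. */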