Bug 1122900: Make libyuv compile with MSVC 2015, r=rjesup.
author: Brian Smith <brian@briansmith.org>
date: Tue, 31 Mar 2015 08:34:30 -1000
changeset 270327 8190c13db4204d475bc578102481cf7dafd1f03d
parent 270326 2622475e765b2ea1a937fba2107ebc4a328882ba
child 270328 aa01c3fd458c688e505b6f5c106ae4ff3bebe040
push id: 863
push user: raliiev@mozilla.com
push date: Mon, 03 Aug 2015 13:22:43 +0000
treeherder: mozilla-release@f6321b14228d
reviewers: rjesup
bugs: 1122900
milestone: 40.0a1
Bug 1122900: Make libyuv compile with MSVC 2015, r=rjesup. MSVC 2015 CTP 6 refuses to compile __declspec(align(16)) on these naked functions, and MSVC already aligns functions at 16-byte boundaries in its normal configuration.
media/libyuv/source/compare_win.cc
media/libyuv/source/rotate.cc
media/libyuv/source/row_win.cc
media/libyuv/source/scale_win.cc
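
For context, a minimal sketch of the pattern the patch removes (hypothetical function name, not from libyuv; x86 MSVC inline asm only):

  // The pattern being removed -- MSVC 2015 CTP 6 refuses to compile it,
  // while earlier MSVC versions accepted it:
  //
  //   __declspec(naked) __declspec(align(16))
  //   void RowFunc(void) {
  //     __asm { ret }
  //   }

  // The form the patch leaves behind. Dropping align(16) is harmless:
  // MSVC already places functions on 16-byte boundaries in its normal
  // configuration, so codegen is effectively unchanged.
  __declspec(naked)
  void RowFunc(void) {
    __asm { ret }     // naked: no compiler-generated prologue/epilogue
  }
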
--- a/media/libyuv/source/compare_win.cc
+++ b/media/libyuv/source/compare_win.cc
@@ -13,17 +13,17 @@
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   __asm {
     mov        eax, [esp + 4]    // src_a
     mov        edx, [esp + 8]    // src_b
     mov        ecx, [esp + 12]   // count
     pxor       xmm0, xmm0
     pxor       xmm5, xmm5
 
@@ -55,17 +55,17 @@ uint32 SumSquareError_SSE2(const uint8* 
     ret
   }
 }
 
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
 #pragma warning(disable: 4752)
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
   __asm {
     mov        eax, [esp + 4]    // src_a
     mov        edx, [esp + 8]    // src_b
     mov        ecx, [esp + 12]   // count
     vpxor      ymm0, ymm0, ymm0  // sum
     vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
     sub        edx, eax
@@ -130,17 +130,17 @@ static uvec32 kHashMul3 = {
 // 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
 // 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
 // 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
 // 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
 // 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
     _asm _emit 0x40 _asm _emit reg
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
   __asm {
     mov        eax, [esp + 4]    // src
     mov        ecx, [esp + 8]    // count
     movd       xmm0, [esp + 12]  // seed
 
     pxor       xmm7, xmm7        // constant 0 for unpck
     movdqa     xmm6, kHash16x33
@@ -182,17 +182,17 @@ uint32 HashDjb2_SSE41(const uint8* src, 
 
     movd       eax, xmm0         // return hash
     ret
   }
 }
 
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
   __asm {
     mov        eax, [esp + 4]    // src
     mov        ecx, [esp + 8]    // count
     movd       xmm0, [esp + 12]  // seed
     movdqa     xmm6, kHash16x33
 
     align      4
--- a/media/libyuv/source/rotate.cc
+++ b/media/libyuv/source/rotate.cc
@@ -71,17 +71,17 @@ void TransposeUVWx8_MIPS_DSPR2(const uin
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int width);
 #endif  // defined(__mips__)
 
 #if !defined(LIBYUV_DISABLE_X86) && \
     defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
   __asm {
     push      edi
     push      esi
     push      ebp
     mov       eax, [esp + 12 + 4]   // src
     mov       edi, [esp + 12 + 8]   // src_stride
@@ -163,17 +163,17 @@ static void TransposeWx8_SSSE3(const uin
     pop       ebp
     pop       esi
     pop       edi
     ret
   }
 }
 
 #define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                 uint8* dst_a, int dst_stride_a,
                                 uint8* dst_b, int dst_stride_b,
                                 int w) {
   __asm {
     push      ebx
     push      esi
     push      edi
--- a/media/libyuv/source/row_win.cc
+++ b/media/libyuv/source/row_win.cc
@@ -139,17 +139,17 @@ static const uvec8 kShuffleMaskARGBToRGB
 };
 
 // Shuffle table for converting ARGB to RAW.
 static const uvec8 kShuffleMaskARGBToRAW_0 = {
   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
 };
 
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   __asm {
     mov        eax, [esp + 4]        // src_y
     mov        edx, [esp + 8]        // dst_argb
     mov        ecx, [esp + 12]       // pix
     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
     pslld      xmm5, 24
 
@@ -167,17 +167,17 @@ void I400ToARGBRow_SSE2(const uint8* src
     movdqa     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                   int pix) {
   __asm {
     mov        eax, [esp + 4]        // src_y
     mov        edx, [esp + 8]        // dst_argb
     mov        ecx, [esp + 12]       // pix
     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
     pslld      xmm5, 24
@@ -196,17 +196,17 @@ void I400ToARGBRow_Unaligned_SSE2(const 
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_rgb24
     mov       edx, [esp + 8]   // dst_argb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
     pslld     xmm5, 24
     movdqa    xmm4, kShuffleMaskRGB24ToARGB
@@ -235,17 +235,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* s
     sub       ecx, 16
     movdqa    [edx + 48], xmm3
     lea       edx, [edx + 64]
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                         int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_raw
     mov       edx, [esp + 8]   // dst_argb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
     pslld     xmm5, 24
@@ -282,17 +282,17 @@ void RAWToARGBRow_SSSE3(const uint8* src
 
 // pmul method to replicate bits.
 // Math to replicate bits:
 // (v << 8) | (v << 3)
 // v * 256 + v * 8
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                           int pix) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
     pshufd    xmm5, xmm5, 0
     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
     movd      xmm6, eax
@@ -333,17 +333,17 @@ void RGB565ToARGBRow_SSE2(const uint8* s
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
     ret
   }
 }
 
 // 24 instructions
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                             int pix) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
     pshufd    xmm5, xmm5, 0
     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
     movd      xmm6, eax
@@ -387,17 +387,17 @@ void ARGB1555ToARGBRow_SSE2(const uint8*
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
     ret
   }
 }
 
 // 18 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                             int pix) {
   __asm {
     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
     movd      xmm4, eax
     pshufd    xmm4, xmm4, 0
     movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
     pslld     xmm5, 4
@@ -426,17 +426,17 @@ void ARGB4444ToARGBRow_SSE2(const uint8*
     movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm6, kShuffleMaskARGBToRGB24
 
     align      4
@@ -465,17 +465,17 @@ void ARGBToRGB24Row_SSSE3(const uint8* s
     movdqu    [edx + 32], xmm2   // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm6, kShuffleMaskARGBToRAW
 
     align      4
@@ -504,17 +504,17 @@ void ARGBToRAWRow_SSSE3(const uint8* src
     movdqu    [edx + 32], xmm2   // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
     psrld     xmm3, 27
     pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
@@ -543,17 +543,17 @@ void ARGBToRGB565Row_SSE2(const uint8* s
     lea       edx, [edx + 8]
     sub       ecx, 4
     jg        convertloop
     ret
   }
 }
 
 // TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
     psrld     xmm4, 27
     movdqa    xmm5, xmm4       // generate mask 0x000003e0
@@ -585,17 +585,17 @@ void ARGBToARGB1555Row_SSE2(const uint8*
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
     lea       edx, [edx + 8]
     sub       ecx, 4
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
     mov       ecx, [esp + 12]  // pix
     pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
     psllw     xmm4, 12
     movdqa    xmm3, xmm4       // generate mask 0x00f000f0
@@ -616,17 +616,17 @@ void ARGBToARGB4444Row_SSE2(const uint8*
     lea       edx, [edx + 8]
     sub       ecx, 4
     jg        convertloop
     ret
   }
 }
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kARGBToY
 
@@ -651,17 +651,17 @@ void ARGBToYRow_SSSE3(const uint8* src_a
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm4, kARGBToYJ
     movdqa     xmm5, kAddYJ64
 
@@ -688,17 +688,17 @@ void ARGBToYJRow_SSSE3(const uint8* src_
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     vbroadcastf128 ymm4, kARGBToY
     vbroadcastf128 ymm5, kAddY16
     vmovdqa    ymm6, kPermdARGBToY_AVX
@@ -728,17 +728,17 @@ void ARGBToYRow_AVX2(const uint8* src_ar
     vzeroupper
     ret
   }
 }
 #endif  //  HAS_ARGBTOYROW_AVX2
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     vbroadcastf128 ymm4, kARGBToYJ
     vbroadcastf128 ymm5, kAddYJ64
     vmovdqa    ymm6, kPermdARGBToY_AVX
@@ -768,17 +768,17 @@ void ARGBToYJRow_AVX2(const uint8* src_a
     jg         convertloop
 
     vzeroupper
     ret
   }
 }
 #endif  //  HAS_ARGBTOYJROW_AVX2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kARGBToY
 
@@ -802,17 +802,17 @@ void ARGBToYRow_Unaligned_SSSE3(const ui
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm4, kARGBToYJ
     movdqa     xmm5, kAddYJ64
 
@@ -837,17 +837,17 @@ void ARGBToYJRow_Unaligned_SSSE3(const u
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kBGRAToY
 
@@ -871,17 +871,17 @@ void BGRAToYRow_SSSE3(const uint8* src_a
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kBGRAToY
 
@@ -905,17 +905,17 @@ void BGRAToYRow_Unaligned_SSSE3(const ui
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kABGRToY
 
@@ -939,17 +939,17 @@ void ABGRToYRow_SSSE3(const uint8* src_a
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kABGRToY
 
@@ -973,17 +973,17 @@ void ABGRToYRow_Unaligned_SSSE3(const ui
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kRGBAToY
 
@@ -1007,17 +1007,17 @@ void RGBAToYRow_SSSE3(const uint8* src_a
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
     movdqa     xmm4, kRGBAToY
 
@@ -1041,17 +1041,17 @@ void RGBAToYRow_Unaligned_SSSE3(const ui
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1107,17 +1107,17 @@ void ARGBToUVRow_SSSE3(const uint8* src_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                         uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1175,17 +1175,17 @@ void ARGBToUVJRow_SSSE3(const uint8* src
 
     pop        edi
     pop        esi
     ret
   }
 }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1241,17 +1241,17 @@ void ARGBToUVRow_AVX2(const uint8* src_a
     pop        edi
     pop        esi
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBTOUVROW_AVX2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1311,17 +1311,17 @@ void ARGBToUVRow_Unaligned_SSSE3(const u
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1382,17 +1382,17 @@ void ARGBToUVJRow_Unaligned_SSSE3(const 
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                           uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]   // src_argb
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
@@ -1440,17 +1440,17 @@ void ARGBToUV444Row_SSSE3(const uint8* s
     lea        edx,  [edx + 16]
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                     uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]   // src_argb
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
@@ -1498,17 +1498,17 @@ void ARGBToUV444Row_Unaligned_SSSE3(cons
     lea        edx,  [edx + 16]
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                           uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]   // src_argb
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
@@ -1557,17 +1557,17 @@ void ARGBToUV422Row_SSSE3(const uint8* s
     lea        edx, [edx + 8]
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                     uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]   // src_argb
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
@@ -1616,17 +1616,17 @@ void ARGBToUV422Row_Unaligned_SSSE3(cons
     lea        edx, [edx + 8]
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1682,17 +1682,17 @@ void BGRAToUVRow_SSSE3(const uint8* src_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1752,17 +1752,17 @@ void BGRAToUVRow_Unaligned_SSSE3(const u
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1818,17 +1818,17 @@ void ABGRToUVRow_SSSE3(const uint8* src_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1888,17 +1888,17 @@ void ABGRToUVRow_Unaligned_SSSE3(const u
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -1954,17 +1954,17 @@ void RGBAToUVRow_SSSE3(const uint8* src_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb
     mov        esi, [esp + 8 + 8]   // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
@@ -2072,17 +2072,17 @@ static const lvec16 kUVBiasG_AVX = {
   BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
 };
 static const lvec16 kUVBiasR_AVX = {
   BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
 };
 
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_AVX2(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2271,17 +2271,17 @@ static const vec16 kUVBiasR = { BR, BR, 
     __asm psraw      xmm2, 6                                                   \
     __asm packuswb   xmm0, xmm0           /* B */                              \
     __asm packuswb   xmm1, xmm1           /* G */                              \
     __asm packuswb   xmm2, xmm2           /* R */                              \
   }
 
 // 8 pixels, dest aligned 16.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I444ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2314,17 +2314,17 @@ void I444ToARGBRow_SSSE3(const uint8* y_
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
                           uint8* dst_rgb24,
                           int width) {
   __asm {
     push       esi
     push       edi
@@ -2361,17 +2361,17 @@ void I422ToRGB24Row_SSSE3(const uint8* y
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRAWRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_raw,
                         int width) {
   __asm {
     push       esi
     push       edi
@@ -2408,17 +2408,17 @@ void I422ToRAWRow_SSSE3(const uint8* y_b
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest unaligned.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                            const uint8* u_buf,
                            const uint8* v_buf,
                            uint8* rgb565_buf,
                            int width) {
   __asm {
     push       esi
     push       edi
@@ -2481,17 +2481,17 @@ void I422ToRGB565Row_SSSE3(const uint8* 
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2525,17 +2525,17 @@ void I422ToARGBRow_SSSE3(const uint8* y_
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I411ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       ebx
     push       esi
@@ -2570,17 +2570,17 @@ void I411ToARGBRow_SSSE3(const uint8* y_
     pop        esi
     pop        ebx
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* uv_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // Y
     mov        esi, [esp + 4 + 8]   // UV
@@ -2608,17 +2608,17 @@ void NV12ToARGBRow_SSSE3(const uint8* y_
 
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* uv_buf,
                          uint8* dst_argb,
                          int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // Y
     mov        esi, [esp + 4 + 8]   // VU
@@ -2646,17 +2646,17 @@ void NV21ToARGBRow_SSSE3(const uint8* y_
 
     pop        esi
     ret
   }
 }
 
 // 8 pixels, unaligned.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -2689,17 +2689,17 @@ void I444ToARGBRow_Unaligned_SSSE3(const
     pop        edi
     pop        esi
     ret
   }
 }
 
 // 8 pixels, unaligned.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -2733,17 +2733,17 @@ void I422ToARGBRow_Unaligned_SSSE3(const
     pop        esi
     ret
   }
 }
 
 // 8 pixels, unaligned.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       ebx
     push       esi
@@ -2778,17 +2778,17 @@ void I411ToARGBRow_Unaligned_SSSE3(const
     pop        esi
     pop        ebx
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* uv_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // Y
     mov        esi, [esp + 4 + 8]   // UV
@@ -2816,17 +2816,17 @@ void NV12ToARGBRow_Unaligned_SSSE3(const
 
     pop        esi
     ret
   }
 }
 
 // 8 pixels, dest aligned 16.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* uv_buf,
                                    uint8* dst_argb,
                                    int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // Y
     mov        esi, [esp + 4 + 8]   // VU
@@ -2852,17 +2852,17 @@ void NV21ToARGBRow_Unaligned_SSSE3(const
     sub        ecx, 8
     jg         convertloop
 
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToBGRARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_bgra,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2893,17 +2893,17 @@ void I422ToBGRARow_SSSE3(const uint8* y_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_bgra,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -2934,17 +2934,17 @@ void I422ToBGRARow_Unaligned_SSSE3(const
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToABGRRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_abgr,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -2975,17 +2975,17 @@ void I422ToABGRRow_SSSE3(const uint8* y_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_abgr,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -3016,17 +3016,17 @@ void I422ToABGRRow_Unaligned_SSSE3(const
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGBARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_rgba,
                          int width) {
   __asm {
     push       esi
     push       edi
@@ -3057,17 +3057,17 @@ void I422ToRGBARow_SSSE3(const uint8* y_
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* dst_rgba,
                                    int width) {
   __asm {
     push       esi
     push       edi
@@ -3101,17 +3101,17 @@ void I422ToRGBARow_Unaligned_SSSE3(const
     pop        esi
     ret
   }
 }
 
 #endif  // HAS_I422TOARGBROW_SSSE3
 
 #ifdef HAS_YTOARGBROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YToARGBRow_SSE2(const uint8* y_buf,
                      uint8* rgb_buf,
                      int width) {
   __asm {
     pxor       xmm5, xmm5
     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
     pslld      xmm4, 24
     mov        eax, 0x00100010
@@ -3154,17 +3154,17 @@ void YToARGBRow_SSE2(const uint8* y_buf,
 #endif  // HAS_YTOARGBROW_SSE2
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
 static const uvec8 kShuffleMirror = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     movdqa    xmm5, kShuffleMirror
     lea       eax, [eax - 16]
 
@@ -3183,17 +3183,17 @@ void MirrorRow_SSSE3(const uint8* src, u
 
 #ifdef HAS_MIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
 static const ulvec8 kShuffleMirror_AVX2 = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     vmovdqa   ymm5, kShuffleMirror_AVX2
     lea       eax, [eax - 32]
 
@@ -3210,17 +3210,17 @@ void MirrorRow_AVX2(const uint8* src, ui
     ret
   }
 }
 #endif  // HAS_MIRRORROW_AVX2
 
 #ifdef HAS_MIRRORROW_SSE2
 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
 // version can not.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 16]
 
     align      4
@@ -3243,17 +3243,17 @@ void MirrorRow_SSE2(const uint8* src, ui
 #endif  // HAS_MIRRORROW_SSE2
 
 #ifdef HAS_MIRRORROW_UV_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
 static const uvec8 kShuffleMirrorUV = {
   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                        int width) {
   __asm {
     push      edi
     mov       eax, [esp + 4 + 4]   // src
     mov       edx, [esp + 4 + 8]   // dst_u
     mov       edi, [esp + 4 + 12]  // dst_v
     mov       ecx, [esp + 4 + 16]  // width
@@ -3279,17 +3279,17 @@ void MirrorUVRow_SSSE3(const uint8* src,
 #endif  // HAS_MIRRORROW_UV_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
 static const uvec8 kARGBShuffleMirror = {
   12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
     movdqa    xmm5, kARGBShuffleMirror
 
@@ -3308,17 +3308,17 @@ void ARGBMirrorRow_SSSE3(const uint8* sr
 #endif  // HAS_ARGBMIRRORROW_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
 static const ulvec32 kARGBShuffleMirror_AVX2 = {
   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 32]
     vmovdqa   ymm5, kARGBShuffleMirror_AVX2
 
@@ -3331,17 +3331,17 @@ void ARGBMirrorRow_AVX2(const uint8* src
     jg        convertloop
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
@@ -3367,17 +3367,17 @@ void SplitUVRow_SSE2(const uint8* src_uv
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                                int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -3406,17 +3406,17 @@ void SplitUVRow_Unaligned_SSE2(const uin
 
     pop        edi
     ret
   }
 }
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
@@ -3445,17 +3445,17 @@ void SplitUVRow_AVX2(const uint8* src_uv
     pop        edi
     vzeroupper
     ret
   }
 }
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_u
     mov        edx, [esp + 4 + 8]    // src_v
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // width
@@ -3475,17 +3475,17 @@ void MergeUVRow_SSE2(const uint8* src_u,
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                                uint8* dst_uv, int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_u
     mov        edx, [esp + 4 + 8]    // src_v
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // width
@@ -3507,17 +3507,17 @@ void MergeUVRow_Unaligned_SSE2(const uin
 
     pop        edi
     ret
   }
 }
 #endif  //  HAS_MERGEUVROW_SSE2
 
 #ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_u
     mov        edx, [esp + 4 + 8]    // src_v
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // width
@@ -3542,17 +3542,17 @@ void MergeUVRow_AVX2(const uint8* src_u,
     vzeroupper
     ret
   }
 }
 #endif  //  HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
 
     align      4
   convertloop:
@@ -3565,33 +3565,33 @@ void CopyRow_SSE2(const uint8* src, uint
     sub        ecx, 32
     jg         convertloop
     ret
   }
 }
 #endif  // HAS_COPYROW_SSE2
 
 // Unaligned Multiple of 1.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, esi
     mov        edx, edi
     mov        esi, [esp + 4]   // src
     mov        edi, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     rep movsb
     mov        edi, edx
     mov        esi, eax
     ret
   }
 }
 
 #ifdef HAS_COPYROW_X86
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, esi
     mov        edx, edi
     mov        esi, [esp + 4]   // src
     mov        edi, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     shr        ecx, 2
@@ -3600,17 +3600,17 @@ void CopyRow_X86(const uint8* src, uint8
     mov        esi, eax
     ret
   }
 }
 #endif  // HAS_COPYROW_X86
 
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
     pslld      xmm0, 24
     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
@@ -3637,17 +3637,17 @@ void ARGBCopyAlphaRow_SSE2(const uint8* 
 
     ret
   }
 }
 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     vpcmpeqb   ymm0, ymm0, ymm0
     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
 
@@ -3667,17 +3667,17 @@ void ARGBCopyAlphaRow_AVX2(const uint8* 
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
     pslld      xmm0, 24
     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
@@ -3706,17 +3706,17 @@ void ARGBCopyYToAlphaRow_SSE2(const uint
 
     ret
   }
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
     vpcmpeqb   ymm0, ymm0, ymm0
     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
 
@@ -3738,32 +3738,32 @@ void ARGBCopyYToAlphaRow_AVX2(const uint
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 
 #ifdef HAS_SETROW_X86
 // SetRow8 writes 'count' bytes using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SetRow_X86(uint8* dst, uint32 v32, int count) {
   __asm {
     mov        edx, edi
     mov        edi, [esp + 4]   // dst
     mov        eax, [esp + 8]   // v32
     mov        ecx, [esp + 12]  // count
     shr        ecx, 2
     rep stosd
     mov        edi, edx
     ret
   }
 }
 
 // SetRow32 writes 'count' words using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
                    int dst_stride, int height) {
   __asm {
     push       esi
     push       edi
     push       ebp
     mov        edi, [esp + 12 + 4]   // dst
     mov        eax, [esp + 12 + 8]   // v32
@@ -3785,17 +3785,17 @@ void ARGBSetRows_X86(uint8* dst, uint32 
     pop        edi
     pop        esi
     ret
   }
 }
 #endif  // HAS_SETROW_X86
 
 #ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
                      uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_yuy2
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
@@ -3813,17 +3813,17 @@ void YUY2ToYRow_AVX2(const uint8* src_yu
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
     vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -3858,17 +3858,17 @@ void YUY2ToUVRow_AVX2(const uint8* src_y
 
     pop        edi
     pop        esi
     vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -3898,17 +3898,17 @@ void YUY2ToUV422Row_AVX2(const uint8* sr
     jg         convertloop
 
     pop        edi
     vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_AVX2(const uint8* src_uyvy,
                      uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_uyvy
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
 
     align      4
@@ -3924,17 +3924,17 @@ void UYVYToYRow_AVX2(const uint8* src_uy
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
     ret
     vzeroupper
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -3969,17 +3969,17 @@ void UYVYToUVRow_AVX2(const uint8* src_u
 
     pop        edi
     pop        esi
     vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4011,17 +4011,17 @@ void UYVYToUV422Row_AVX2(const uint8* sr
     pop        edi
     vzeroupper
     ret
   }
 }
 #endif  // HAS_YUY2TOYROW_AVX2
 
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                      uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_yuy2
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
     psrlw      xmm5, 8
@@ -4037,17 +4037,17 @@ void YUY2ToYRow_SSE2(const uint8* src_yu
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -4081,17 +4081,17 @@ void YUY2ToUVRow_SSE2(const uint8* src_y
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4118,17 +4118,17 @@ void YUY2ToUV422Row_SSE2(const uint8* sr
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                                uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_yuy2
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
     psrlw      xmm5, 8
@@ -4144,17 +4144,17 @@ void YUY2ToYRow_Unaligned_SSE2(const uin
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
     mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -4188,17 +4188,17 @@ void YUY2ToUVRow_Unaligned_SSE2(const ui
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                    uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4225,17 +4225,17 @@ void YUY2ToUV422Row_Unaligned_SSE2(const
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_SSE2(const uint8* src_uyvy,
                      uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_uyvy
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
 
     align      4
@@ -4249,17 +4249,17 @@ void UYVYToYRow_SSE2(const uint8* src_uy
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_uyvy
     mov        esi, [esp + 8 + 8]    // stride_uyvy
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -4293,17 +4293,17 @@ void UYVYToUVRow_SSE2(const uint8* src_u
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uyvy
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4330,17 +4330,17 @@ void UYVYToUV422Row_SSE2(const uint8* sr
     sub        ecx, 16
     jg         convertloop
 
     pop        edi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                                uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_uyvy
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
 
     align      4
@@ -4354,17 +4354,17 @@ void UYVYToYRow_Unaligned_SSE2(const uin
     sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                 uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_uyvy
     mov        esi, [esp + 8 + 8]    // stride_uyvy
     mov        edx, [esp + 8 + 12]   // dst_u
@@ -4398,17 +4398,17 @@ void UYVYToUVRow_Unaligned_SSE2(const ui
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                    uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uyvy
     mov        edx, [esp + 4 + 8]    // dst_u
     mov        edi, [esp + 4 + 12]   // dst_v
     mov        ecx, [esp + 4 + 16]   // pix
@@ -4438,17 +4438,17 @@ void UYVYToUV422Row_Unaligned_SSE2(const
     pop        edi
     ret
   }
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
 #ifdef HAS_ARGBBLENDROW_SSE2
 // Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -4572,17 +4572,17 @@ static const uvec8 kShuffleAlpha = {
 // Same as SSE2, but replaces:
 //    psrlw      xmm3, 8          // alpha
 //    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
 //    pshuflw    xmm3, xmm3, 0F5h
 // with..
 //    pshufb     xmm3, kShuffleAlpha // alpha
 // Blend 8 pixels at a time.
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -4720,17 +4720,17 @@ void ARGBBlendRow_SSSE3(const uint8* src
     ret
   }
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
 #ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
 // Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
     pslld      xmm4, 24
     pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
@@ -4770,17 +4770,17 @@ void ARGBAttenuateRow_SSE2(const uint8* 
 // Shuffle table duplicating alpha.
 static const uvec8 kShuffleAlpha0 = {
   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
 };
 static const uvec8 kShuffleAlpha1 = {
   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
     pslld      xmm3, 24
     movdqa     xmm4, kShuffleAlpha0
@@ -4818,17 +4818,17 @@ void ARGBAttenuateRow_SSSE3(const uint8*
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const ulvec8 kShuffleAlpha_AVX2 = {
   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
   14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
   14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vmovdqa    ymm4, kShuffleAlpha_AVX2
     vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
@@ -4857,17 +4857,17 @@ void ARGBAttenuateRow_AVX2(const uint8* 
     ret
   }
 }
 #endif  // HAS_ARGBATTENUATEROW_AVX2
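
For reference, a scalar sketch of what the attenuate kernels above compute.
The SIMD paths approximate channel * alpha / 255 with 16-bit multiplies, so
exact rounding differs slightly between the SSE2/SSSE3/AVX2 variants; the
helper name below is illustrative, not libyuv's C fallback verbatim.

#include <stdint.h>

// Multiply B, G and R by the pixel's own alpha; alpha itself is kept.
static void ARGBAttenuateRow_Sketch(const uint8_t* src_argb,
                                    uint8_t* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)((src_argb[0] * a) >> 8);  // B
    dst_argb[1] = (uint8_t)((src_argb[1] * a) >> 8);  // G
    dst_argb[2] = (uint8_t)((src_argb[2] * a) >> 8);  // R
    dst_argb[3] = (uint8_t)a;                         // A unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}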
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
 // Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_argb0
     mov        edx, [esp + 8 + 8]   // dst_argb
     mov        ecx, [esp + 8 + 12]  // width
@@ -4913,17 +4913,17 @@ void ARGBUnattenuateRow_SSE2(const uint8
 // Shuffle table duplicating alpha.
 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
 };
 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
 // USE_GATHER is off by default because the gather instruction is slow.
 #ifdef USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
@@ -4948,17 +4948,17 @@ void ARGBUnattenuateRow_AVX2(const uint8
     lea        eax, [eax + 32]
     jg         convertloop
 
     vzeroupper
     ret
   }
 }
 #else  // USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
 
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
@@ -5016,17 +5016,17 @@ void ARGBUnattenuateRow_AVX2(const uint8
     ret
   }
 }
 #endif  // USE_GATHER
 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
     mov        ecx, [esp + 12]  /* width */
     movdqa     xmm4, kARGBToYJ
     movdqa     xmm5, kAddYJ64
 
@@ -5076,17 +5076,17 @@ static const vec8 kARGBToSepiaG = {
   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
 };
 
 static const vec8 kARGBToSepiaR = {
   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
 };
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* dst_argb */
     mov        ecx, [esp + 8]   /* width */
     movdqa     xmm2, kARGBToSepiaB
     movdqa     xmm3, kARGBToSepiaG
     movdqa     xmm4, kARGBToSepiaR
 
@@ -5134,17 +5134,17 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb,
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3
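
As a worked example of the sepia matrix form, one output channel is a
pmaddubsw-style dot product of the input (B, G, R) with a constant row,
shifted right by 7 and saturated (packuswb supplies the saturation in the
SIMD path). A sketch using the kARGBToSepiaG row shown above; the B and R
channels use their own rows the same way:

#include <stdint.h>

static uint8_t SepiaG_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  int v = (22 * b + 88 * g + 45 * r) >> 7;  // kARGBToSepiaG dot product
  return (uint8_t)(v > 255 ? 255 : v);
}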
 
 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
 // Transform 8 ARGB pixels (32 bytes) with color matrix.
 // Same as Sepia except matrix is provided.
 // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                               const int8* matrix_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
     mov        ecx, [esp + 12]  /* matrix_argb */
     movdqu     xmm5, [ecx]
     pshufd     xmm2, xmm5, 0x00
@@ -5197,17 +5197,17 @@ void ARGBColorMatrixRow_SSSE3(const uint
     ret
   }
 }
 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
 // Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                           int interval_offset, int width) {
   __asm {
     mov        eax, [esp + 4]    /* dst_argb */
     movd       xmm2, [esp + 8]   /* scale */
     movd       xmm3, [esp + 12]  /* interval_size */
     movd       xmm4, [esp + 16]  /* interval_offset */
     mov        ecx, [esp + 20]   /* width */
@@ -5244,17 +5244,17 @@ void ARGBQuantizeRow_SSE2(uint8* dst_arg
     ret
   }
 }
 #endif  // HAS_ARGBQUANTIZEROW_SSE2
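
A scalar sketch of the quantizer, assuming (as the asm suggests) that scale
is a 16.16 fixed-point reciprocal of interval_size: each color channel is
snapped to the start of its interval and alpha is left untouched.

#include <stdint.h>

static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 3; ++j) {  // B, G, R; alpha stays as-is
      int v = dst_argb[j];
      dst_argb[j] = (uint8_t)((v * scale >> 16) * interval_size +
                              interval_offset);
    }
    dst_argb += 4;
  }
}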
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
 // Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
   __asm {
     mov        eax, [esp + 4]   // src_argb
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     movd       xmm2, [esp + 16]  // value
     punpcklbw  xmm2, xmm2
@@ -5279,17 +5279,17 @@ void ARGBShadeRow_SSE2(const uint8* src_
 
     ret
   }
 }
 #endif  // HAS_ARGBSHADEROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5319,17 +5319,17 @@ void ARGBMultiplyRow_SSE2(const uint8* s
     ret
   }
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2
 
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                      uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5368,17 +5368,17 @@ void ARGBAddRow_SSE2(const uint8* src_ar
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBADDROW_SSE2
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5398,17 +5398,17 @@ void ARGBSubtractRow_SSE2(const uint8* s
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5436,17 +5436,17 @@ void ARGBMultiplyRow_AVX2(const uint8* s
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBMULTIPLYROW_AVX2
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                      uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5466,17 +5466,17 @@ void ARGBAddRow_AVX2(const uint8* src_ar
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBADDROW_AVX2
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5499,17 +5499,17 @@ void ARGBSubtractRow_AVX2(const uint8* s
 }
 #endif  // HAS_ARGBSUBTRACTROW_AVX2
 
 #ifdef HAS_SOBELXROW_SSE2
 // SobelX as a matrix is
 // -1  0  1
 // -2  0  2
 // -1  0  1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobelx, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   // src_y0
     mov        esi, [esp + 8 + 8]   // src_y1
     mov        edi, [esp + 8 + 12]  // src_y2
@@ -5556,17 +5556,17 @@ void SobelXRow_SSE2(const uint8* src_y0,
 }
 #endif  // HAS_SOBELXROW_SSE2
 
 #ifdef HAS_SOBELYROW_SSE2
 // SobelY as a matrix is
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                     uint8* dst_sobely, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_y0
     mov        esi, [esp + 4 + 8]   // src_y1
     mov        edx, [esp + 4 + 12]  // dst_sobely
     mov        ecx, [esp + 4 + 16]  // width
@@ -5610,17 +5610,17 @@ void SobelYRow_SSE2(const uint8* src_y0,
 #endif  // HAS_SOBELYROW_SSE2
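
A scalar sketch of the two Sobel row kernels above: apply the 3x3 kernel
along a row, take the absolute value and saturate to a byte. This mirrors
what the SIMD computes; names are illustrative.

#include <stdint.h>
#include <stdlib.h>

static uint8_t SobelClamp(int v) {
  v = abs(v);
  return (uint8_t)(v > 255 ? 255 : v);
}

static void SobelXRow_Sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                             const uint8_t* src_y2, uint8_t* dst_sobelx,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    dst_sobelx[i] = SobelClamp(a + b * 2 + c);
  }
}

static void SobelYRow_Sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                             uint8_t* dst_sobely, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i + 0] - src_y1[i + 0];
    int b = src_y0[i + 1] - src_y1[i + 1];
    int c = src_y0[i + 2] - src_y1[i + 2];
    dst_sobely[i] = SobelClamp(a + b * 2 + c);
  }
}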
 
 #ifdef HAS_SOBELROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
 // A = 255
 // R = Sobel
 // G = Sobel
 // B = Sobel
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                    uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_sobelx
     mov        esi, [esp + 4 + 8]   // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5658,17 +5658,17 @@ void SobelRow_SSE2(const uint8* src_sobe
     pop        esi
     ret
   }
 }
 #endif  // HAS_SOBELROW_SSE2
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                           uint8* dst_y, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_sobelx
     mov        esi, [esp + 4 + 8]   // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_y
     mov        ecx, [esp + 4 + 16]  // width
@@ -5692,17 +5692,17 @@ void SobelToPlaneRow_SSE2(const uint8* s
 #endif  // HAS_SOBELTOPLANEROW_SSE2
 
 #ifdef HAS_SOBELXYROW_SSE2
 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
 // A = 255
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_sobelx
     mov        esi, [esp + 4 + 8]   // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
@@ -5986,17 +5986,17 @@ void ComputeCumulativeSumRow_SSE2(const 
 
  l1b:
   }
 }
 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy a row of ARGB pixels from the source image, stepping along an affine slope.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 12]  // src_argb
     mov        esi, [esp + 16]  // stride
@@ -6073,17 +6073,17 @@ void ARGBAffineRow_SSE2(const uint8* src
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
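
A scalar sketch of the affine row copy: uv_dudv holds the starting (u, v)
and the per-pixel (du, dv) step, and each destination pixel is
point-sampled from the source image.

#include <stdint.h>
#include <string.h>

static void ARGBAffineRow_Sketch(const uint8_t* src_argb, int src_argb_stride,
                                 uint8_t* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t* src = src_argb + (int)v * src_argb_stride + (int)u * 4;
    memcpy(dst_argb + i * 4, src, 4);  // copy one BGRA pixel
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}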
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride, int dst_width,
                           int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6174,17 +6174,17 @@ void InterpolateRow_AVX2(uint8* dst_ptr,
     vzeroupper
     ret
   }
 }
 #endif  // HAS_INTERPOLATEROW_AVX2
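
All of the InterpolateRow variants implement the same 16x2 -> 16x1 bilinear
blend; the fraction-specific fast paths in the asm are specializations of
this scalar sketch:

#include <stddef.h>
#include <stdint.h>

static void InterpolateRow_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  int y1 = source_y_fraction;  // weight of the second row, 0..255
  int y0 = 256 - y1;           // weight of the first row
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] =
        (uint8_t)((src_ptr[i] * y0 + src_ptr[i + src_stride] * y1) >> 8);
  }
}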
 
 #ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride, int dst_width,
                           int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6281,17 +6281,17 @@ void InterpolateRow_SSSE3(uint8* dst_ptr
     pop        esi
     ret
   }
 }
 #endif  // HAS_INTERPOLATEROW_SSSE3
 
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6393,17 +6393,17 @@ void InterpolateRow_SSE2(uint8* dst_ptr,
     pop        edi
     pop        esi
     ret
   }
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6499,17 +6499,17 @@ void InterpolateRow_Unaligned_SSSE3(uint
     pop        edi
     pop        esi
     ret
   }
 }
 
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // dst_ptr
     mov        esi, [esp + 8 + 8]   // src_ptr
@@ -6610,17 +6610,17 @@ void InterpolateRow_Unaligned_SSE2(uint8
   xloop99:
     pop        edi
     pop        esi
     ret
   }
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // src_uv_stride
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // pix
@@ -6635,17 +6635,17 @@ void HalfRow_SSE2(const uint8* src_uv, i
     lea        eax,  [eax + 16]
     jg         convertloop
     pop        edi
     ret
   }
 }
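
HalfRow averages each byte with the byte one stride below, with pavgb-style
rounding; a scalar sketch:

#include <stdint.h>

static void HalfRow_Sketch(const uint8_t* src_uv, int src_uv_stride,
                           uint8_t* dst_uv, int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_uv[i] = (uint8_t)((src_uv[i] + src_uv[i + src_uv_stride] + 1) >> 1);
  }
}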
 
 #ifdef HAS_HALFROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uv
     mov        edx, [esp + 4 + 8]    // src_uv_stride
     mov        edi, [esp + 4 + 12]   // dst_uv
     mov        ecx, [esp + 4 + 16]   // pix
@@ -6662,17 +6662,17 @@ void HalfRow_AVX2(const uint8* src_uv, i
 
     pop        edi
     vzeroupper
     ret
   }
 }
 #endif  // HAS_HALFROW_AVX2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_bayer
     movd       xmm5, [esp + 12]  // selector
     mov        ecx, [esp + 16]   // pix
     pshufd     xmm5, xmm5, 0
@@ -6689,17 +6689,17 @@ void ARGBToBayerRow_SSSE3(const uint8* s
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
     jg         wloop
     ret
   }
 }
 
 // Specialized ARGB to Bayer that just isolates G channel.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                            uint32 selector, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_bayer
                                  // selector
     mov        ecx, [esp + 16]   // pix
     pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
@@ -6720,17 +6720,17 @@ void ARGBToBayerGGRow_SSE2(const uint8* 
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
     jg         wloop
     ret
   }
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                           const uint8* shuffler, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_argb
     mov        ecx, [esp + 12]   // shuffler
     movdqa     xmm5, [ecx]
     mov        ecx, [esp + 16]   // pix
@@ -6746,17 +6746,17 @@ void ARGBShuffleRow_SSSE3(const uint8* s
     movdqa     [edx], xmm0
     movdqa     [edx + 16], xmm1
     lea        edx, [edx + 32]
     jg         wloop
     ret
   }
 }
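
A scalar sketch of the shuffler contract, assuming the usual pshufb layout
where the first four indices (values 0..3) pick the source byte for each
output byte within a 4-byte pixel:

#include <stdint.h>

static void ARGBShuffleRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int pix) {
  int i, j;
  for (i = 0; i < pix; ++i) {
    for (j = 0; j < 4; ++j) {
      dst_argb[j] = src_argb[shuffler[j]];  // reorder within the pixel
    }
    src_argb += 4;
    dst_argb += 4;
  }
}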
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                     const uint8* shuffler, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_argb
     mov        ecx, [esp + 12]   // shuffler
     movdqa     xmm5, [ecx]
     mov        ecx, [esp + 16]   // pix
@@ -6773,17 +6773,17 @@ void ARGBShuffleRow_Unaligned_SSSE3(cons
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     jg         wloop
     ret
   }
 }
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   __asm {
     mov        eax, [esp + 4]     // src_argb
     mov        edx, [esp + 8]     // dst_argb
     mov        ecx, [esp + 12]    // shuffler
     vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
     mov        ecx, [esp + 16]    // pix
@@ -6802,17 +6802,17 @@ void ARGBShuffleRow_AVX2(const uint8* sr
     jg         wloop
 
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   __asm {
     push       ebx
     push       esi
     mov        eax, [esp + 8 + 4]    // src_argb
     mov        edx, [esp + 8 + 8]    // dst_argb
     mov        esi, [esp + 8 + 12]   // shuffler
@@ -6928,17 +6928,17 @@ void ARGBShuffleRow_SSE2(const uint8* sr
 }
 
 // YUY2 - Macro-pixel = 2 image pixels
 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
 
 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1
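
A scalar sketch of the packing done below for YUY2 (UYVY is the same with
chroma leading each pair): two Y samples share one U and one V per 4-byte
macro-pixel. The odd-width tail is omitted here.

#include <stdint.h>

static void I422ToYUY2Row_Sketch(const uint8_t* src_y, const uint8_t* src_u,
                                 const uint8_t* src_v, uint8_t* dst_frame,
                                 int width) {
  int i;
  for (i = 0; i < width - 1; i += 2) {
    dst_frame[0] = src_y[0];  // Y0
    dst_frame[1] = src_u[0];  // U
    dst_frame[2] = src_y[1];  // Y1
    dst_frame[3] = src_v[0];  // V
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}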
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_frame, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_y
@@ -6966,17 +6966,17 @@ void I422ToYUY2Row_SSE2(const uint8* src
     jg         convertloop
 
     pop        edi
     pop        esi
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToUYVYRow_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_frame, int width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_y
@@ -7005,17 +7005,17 @@ void I422ToUYVYRow_SSE2(const uint8* src
 
     pop        edi
     pop        esi
     ret
   }
 }
 
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                             uint8* dst_argb, const float* poly,
                             int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   /* src_argb */
     mov        edx, [esp + 4 + 8]   /* dst_argb */
     mov        esi, [esp + 4 + 12]  /* poly */
@@ -7065,17 +7065,17 @@ void ARGBPolynomialRow_SSE2(const uint8*
     jg         convertloop
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
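
A scalar sketch of the per-channel cubic, assuming poly holds four 4-float
coefficient vectors (C0..C3, one float per channel), which matches how the
asm broadcasts them:

#include <stdint.h>

static void ARGBPolynomialRow_Sketch(const uint8_t* src_argb,
                                     uint8_t* dst_argb, const float* poly,
                                     int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      float v = (float)src_argb[j];
      float r = poly[j] + poly[j + 4] * v + poly[j + 8] * v * v +
                poly[j + 12] * v * v * v;
      if (r < 0.f) r = 0.f;        // the SIMD path clamps via packus
      if (r > 255.f) r = 255.f;
      dst_argb[j] = (uint8_t)r;
    }
    src_argb += 4;
    dst_argb += 4;
  }
}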
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb, const float* poly,
                             int width) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
     mov        ecx, [esp + 12]   /* poly */
     vbroadcastf128 ymm4, [ecx]       // C0
@@ -7106,17 +7106,17 @@ void ARGBPolynomialRow_AVX2(const uint8*
     vzeroupper
     ret
   }
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Transform ARGB pixels with color table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                            int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   /* dst_argb */
     mov        esi, [esp + 4 + 8]   /* table_argb */
     mov        ecx, [esp + 4 + 12]  /* width */
 
@@ -7141,17 +7141,17 @@ void ARGBColorTableRow_X86(uint8* dst_ar
     pop        esi
     ret
   }
 }
 #endif  // HAS_ARGBCOLORTABLEROW_X86
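
A scalar sketch of the table layout: table_argb holds 256 interleaved BGRA
entries, and each channel indexes its own column.

#include <stdint.h>

static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
                                     const uint8_t* table_argb, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      dst_argb[j] = table_argb[dst_argb[j] * 4 + j];
    }
    dst_argb += 4;
  }
}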
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Transform RGB pixels with color table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   /* dst_argb */
     mov        esi, [esp + 4 + 8]   /* table_argb */
     mov        ecx, [esp + 4 + 12]  /* width */
 
     // 1 pixel loop.
@@ -7173,17 +7173,17 @@ void RGBColorTableRow_X86(uint8* dst_arg
     pop        esi
     ret
   }
 }
 #endif  // HAS_RGBCOLORTABLEROW_X86
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Transform RGB pixels with luma table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                  int width,
                                  const uint8* luma, uint32 lumacoeff) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]   /* src_argb */
     mov        edi, [esp + 8 + 8]   /* dst_argb */
--- a/media/libyuv/source/scale_win.cc
+++ b/media/libyuv/source/scale_win.cc
@@ -89,17 +89,17 @@ static uvec8 kShufAb2 =
   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
 
 // Scaling values for boxes of 3x2 and 2x2
 static uvec16 kScaleAb2 =
   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
 
@@ -117,17 +117,17 @@ void ScaleRowDown2_SSE2(const uint8* src
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x1 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
@@ -155,17 +155,17 @@ void ScaleRowDown2Linear_SSE2(const uint
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x2 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
@@ -199,17 +199,17 @@ void ScaleRowDown2Box_SSE2(const uint8* 
 
     pop        esi
     ret
   }
 }
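
Scalar sketches of the down2 kernels above: the point sampler keeps the odd
pixel of each pair (psrlw 8 + packuswb keeps the high bytes), and the box
filter averages each 2x2 block with rounding.

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown2_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                 int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = src_ptr[i * 2 + 1];  // keep odd pixels
  }
}

static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride, uint8_t* dst_ptr,
                                    int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}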
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
@@ -228,17 +228,17 @@ void ScaleRowDown2_Unaligned_SSE2(const 
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x1 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
@@ -267,17 +267,17 @@ void ScaleRowDown2Linear_Unaligned_SSE2(
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x2 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -312,17 +312,17 @@ void ScaleRowDown2Box_Unaligned_SSE2(con
 
     pop        esi
     ret
   }
 }
 
 // Point samples 32 pixels to 8 pixels.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
@@ -345,17 +345,17 @@ void ScaleRowDown4_SSE2(const uint8* src
     jg         wloop
 
     ret
   }
 }
 
 // Blends 32x4 rectangle to 8x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_ptr
     mov        esi, [esp + 8 + 8]    // src_stride
     mov        edx, [esp + 8 + 12]   // dst_ptr
@@ -410,17 +410,17 @@ void ScaleRowDown4Box_SSE2(const uint8* 
 }
 
 // Point samples 32 pixels to 24 pixels.
 // Produces three 8-byte values. For each 8 bytes written, 16 bytes are read
 // and then shuffled to do the scaling.
 
 // Note that movdqa+palign may be better than movdqu.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     movdqa     xmm3, kShuf0
@@ -459,17 +459,17 @@ void ScaleRowDown34_SSSE3(const uint8* s
 // xmm3 shuf 1
 // xmm4 shuf 2
 // xmm5 madd 0
 // xmm6 madd 1
 // xmm7 kRound34
 
 // Note that movdqa+palign may be better than movdqu.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -518,17 +518,17 @@ void ScaleRowDown34_1_Box_SSSE3(const ui
 
     pop        esi
     ret
   }
 }
 
 // Note that movdqa+palign may be better than movdqu.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -581,17 +581,17 @@ void ScaleRowDown34_0_Box_SSSE3(const ui
     pop        esi
     ret
   }
 }
 
 // 3/8 point sampler
 
 // Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     movdqa     xmm4, kShuf38a
@@ -613,17 +613,17 @@ void ScaleRowDown38_SSSE3(const uint8* s
     lea        edx, [edx + 12]
     jg         xloop
 
     ret
   }
 }
 
 // Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -679,17 +679,17 @@ void ScaleRowDown38_3_Box_SSSE3(const ui
     jg         xloop
 
     pop        esi
     ret
   }
 }
 
 // Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
@@ -725,17 +725,17 @@ void ScaleRowDown38_2_Box_SSSE3(const ui
 
     pop        esi
     ret
   }
 }
 
 // Reads 16xN bytes and produces 16 shorts at a time.
 // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint16* dst_ptr, int src_width,
                        int src_height) {
   __asm {
     push       esi
     push       edi
     push       ebx
     push       ebp
@@ -795,17 +795,17 @@ void ScaleAddRows_SSE2(const uint8* src_
 // TODO(fbarchard): Switch the following:
 //    xor        ebx, ebx
 //    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
 // To
 //    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
 // when drmemory bug fixed.
 // https://code.google.com/p/drmemory/issues/detail?id=1396
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx) {
   __asm {
     push       ebx
     push       esi
     push       edi
     mov        edi, [esp + 12 + 4]    // dst_ptr
     mov        esi, [esp + 12 + 8]    // src_ptr
@@ -876,17 +876,17 @@ void ScaleFilterCols_SSSE3(uint8* dst_pt
     pop        esi
     pop        ebx
     ret
   }
 }
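
A scalar sketch of the fixed-point column walk: x is a 16.16 coordinate
advanced by dx per output pixel, and each output linearly blends the two
neighboring source pixels by the fractional part of x.

#include <stdint.h>

static void ScaleFilterCols_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                   int dst_width, int x, int dx) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    int xi = x >> 16;            // integer source position
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[i] = (uint8_t)(a + (((x & 0xffff) * (b - a)) >> 16));
    x += dx;
  }
}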
 
 // Reads 16 pixels, duplicates them and writes 32 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                        int dst_width, int x, int dx) {
   __asm {
     mov        edx, [esp + 4]    // dst_ptr
     mov        eax, [esp + 8]    // src_ptr
     mov        ecx, [esp + 12]   // dst_width
 
     align      4
@@ -903,17 +903,17 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, c
     jg         wloop
 
     ret
   }
 }
 
 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                             ptrdiff_t src_stride,
                             uint8* dst_argb, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_argb
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_argb
     mov        ecx, [esp + 16]       // dst_width
@@ -930,17 +930,17 @@ void ScaleARGBRowDown2_SSE2(const uint8*
     jg         wloop
 
     ret
   }
 }
 
 // Blends 8x1 rectangle to 4x1.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   uint8* dst_argb, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_argb
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_argb
     mov        ecx, [esp + 16]       // dst_width
@@ -960,17 +960,17 @@ void ScaleARGBRowDown2Linear_SSE2(const 
     jg         wloop
 
     ret
   }
 }
 
 // Blends 8x2 rectangle to 4x1.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                ptrdiff_t src_stride,
                                uint8* dst_argb, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_argb
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_argb
@@ -996,17 +996,17 @@ void ScaleARGBRowDown2Box_SSE2(const uin
 
     pop        esi
     ret
   }
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width) {
   __asm {
     push       ebx
     push       edi
     mov        eax, [esp + 8 + 4]    // src_argb
                                      // src_stride ignored
@@ -1034,17 +1034,17 @@ void ScaleARGBRowDownEven_SSE2(const uin
     pop        edi
     pop        ebx
     ret
   }
 }
 
 // Blends four 2x2 to 4x1.
 // Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   int src_stepx,
                                   uint8* dst_argb, int dst_width) {
   __asm {
     push       ebx
     push       esi
     push       edi
@@ -1083,17 +1083,17 @@ void ScaleARGBRowDownEvenBox_SSE2(const 
     pop        edi
     pop        esi
     pop        ebx
     ret
   }
 }
 
 // Column scaling unfiltered. SSE2 version.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                         int dst_width, int x, int dx) {
   __asm {
     push       edi
     push       esi
     mov        edi, [esp + 8 + 4]    // dst_argb
     mov        esi, [esp + 8 + 8]    // src_argb
     mov        ecx, [esp + 8 + 12]   // dst_width
@@ -1177,17 +1177,17 @@ static uvec8 kShuffleColARGB = {
   8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
 static uvec8 kShuffleFractions = {
   0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]    // dst_argb
     mov        esi, [esp + 8 + 8]    // src_argb
     mov        ecx, [esp + 8 + 12]   // dst_width
@@ -1252,17 +1252,17 @@ void ScaleARGBFilterCols_SSSE3(uint8* ds
     pop        edi
     pop        esi
     ret
   }
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int x, int dx) {
   __asm {
     mov        edx, [esp + 4]    // dst_argb
     mov        eax, [esp + 8]    // src_argb
     mov        ecx, [esp + 12]   // dst_width
 
     align      4
@@ -1278,30 +1278,30 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_ar
     lea        edx, [edx + 32]
     jg         wloop
 
     ret
   }
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
     cdq                          // extend num to 64 bits
     shld       edx, eax, 16      // 32.16
     shl        eax, 16
     idiv       dword ptr [esp + 8]
     ret
   }
 }
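
The asm above is equivalent to this portable C: cdq/shld/shl build the
64-bit value num << 16 in edx:eax, and idiv divides it by div.

#include <stdint.h>

static int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}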
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv1_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
     mov        ecx, [esp + 8]    // denom
     cdq                          // extend num to 64 bits
     shld       edx, eax, 16      // 32.16
     shl        eax, 16
     sub        eax, 0x00010001