Bug 926838 - [Part 1] Add new files, and update license file. r=gerv,ehsan
☠☠ backed out by 22416e91c2dc ☠ ☠
author JW Wang <jwwang@mozilla.com>
Wed, 13 Nov 2013 11:07:24 +0800
changeset 273716 abc86341cd753a9b02fb8a1b7ccd2757bf41511b
parent 273715 a91f9bcf83e83bda4e5bcd485c982859be34766a
child 273717 2456dfeb5f9cb187277c28d533f618ffaefd6af6
push id 863
push user raliiev@mozilla.com
push date Mon, 03 Aug 2015 13:22:43 +0000
treeherder mozilla-release@f6321b14228d [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers gerv, ehsan
bugs 926838
milestone 40.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 926838 - [Part 1] Add new files, and update license file. r=gerv,ehsan
media/openmax_dl/LICENSE
media/openmax_dl/OWNERS
media/openmax_dl/README.chromium
media/openmax_dl/dl/api/armCOMM_s.h
media/openmax_dl/dl/api/armOMX.h
media/openmax_dl/dl/api/omxtypes.h
media/openmax_dl/dl/api/omxtypes_s.h
media/openmax_dl/dl/sp/api/armSP.h
media/openmax_dl/dl/sp/api/omxSP.h
media/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
media/openmax_dl/dl/sp/src/armSP_FFT_F32TwiddleTable.c
media/openmax_dl/dl/sp/src/armSP_FFT_S32TwiddleTable.c
media/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_FC32.c
media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC16.c
media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC32.c
media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_F32.c
media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S16S32.c
media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S32.c
media/openmax_dl/dl/sp/src/omxSP_FFTInit_C_FC32.c
media/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC16.c
media/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC32.c
media/openmax_dl/dl/sp/src/omxSP_FFTInit_R_F32.c
media/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S16S32.c
media/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S32.c
media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
media/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S
toolkit/content/license.html
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/LICENSE
@@ -0,0 +1,39 @@
+Use of this source code is governed by a BSD-style license that can be
+found in the LICENSE file in the root of the source tree. All
+contributing project authors may be found in the AUTHORS file in the
+root of the source tree.
+
+The files were originally licensed by ARM Limited.
+
+The following files:
+
+    * dl/api/omxtypes.h
+    * dl/sp/api/omxSP.h
+
+are licensed by Khronos:
+
+Copyright (c) 2005-2008,2015 The Khronos Group Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and/or associated documentation files (the
+"Materials"), to deal in the Materials without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Materials, and to
+permit persons to whom the Materials are furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Materials.
+
+MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+   https://www.khronos.org/registry/
+
+THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/OWNERS
@@ -0,0 +1,3 @@
+ajm@google.com
+kma@google.com
+rtoy@google.com
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/README.chromium
@@ -0,0 +1,19 @@
+Name: OpenMAX DL
+Short Name: OpenMax DL
+URL: https://silver.arm.com/download/Software/Graphics/OX000-BU-00010-r1p0-00bet0/OX000-BU-00010-r1p0-00bet0.tgz
+Version: 1.0.2
+License: BSD
+License File: LICENSE
+Security Critical: yes
+
+Description:
+Implementation of OpenMAX DL spec from ARM.  This is used to support
+WebAudio for Chromium on Android.
+
+Local Modifications:
+Only the FFT routines from the OpenMAX DL package are included.  The
+code was modified to work with gcc and a new implementation for a
+floating-point FFT was added.
+
+The original ARM license is unclear, but Google has obtained
+permission to relicense this code under a BSD license.
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/api/armCOMM_s.h
@@ -0,0 +1,409 @@
+@// -*- Mode: asm; -*-
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+	
+@// 
+@// File Name:  armCOMM_s.h
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   13871
+@// Last Modified Date:       Fri, 09 May 2008
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// ARM optimized OpenMAX common header file
+@//
+
+	.set	_SBytes, 0	@ Scratch bytes reserved on the stack; grown by M_ALLOC4, applied by M_START/M_END
+	.set	_Workspace, 0	@ Base added to every stack-slot offset computed by _M_DATA
+
+	.set	_RRegList, 0	@ Number of the highest R register saved (0 = none saved)
+	.set	_DRegList, 0	@ Number of the highest D register saved (0 = none saved)
+
+        @// Map the M_START rreg argument to _RRegList, the number of the highest
+	@// R register to save.  gas doesn't support setting a variable to a string,
+	@// so the register number is recorded instead.
+	.macro	_M_GETRREGLIST	rreg
+	.ifeqs "\rreg", ""
+	.set	_RRegList, 0	@ Nothing needs to be saved; drop any earlier function's value
+	.exitm
+	.endif
+	@ If rreg is lr or r4, save lr and r4
+	.ifeqs "\rreg", "lr"
+	.set	_RRegList, 4
+	.exitm
+	.endif
+
+	.ifeqs "\rreg", "r4"
+	.set	_RRegList, 4
+	.exitm
+	.endif
+
+	@ If rreg = r5 or r6, save up to register r6
+	.ifeqs "\rreg", "r5"
+	.set	_RRegList, 6
+	.exitm
+	.endif
+	.ifeqs "\rreg", "r6"
+	.set	_RRegList, 6
+	.exitm
+	.endif
+
+	@ If rreg = r7 or r8, save up to register r8
+	.ifeqs "\rreg", "r7"
+	.set	_RRegList, 8
+	.exitm
+	.endif
+	.ifeqs "\rreg", "r8"
+	.set	_RRegList, 8
+	.exitm
+	.endif
+
+	@ If rreg = r9 or r10, save up to register r10
+	.ifeqs "\rreg", "r9"
+	.set	_RRegList, 10
+	.exitm
+	.endif
+	.ifeqs "\rreg", "r10"
+	.set	_RRegList, 10
+	.exitm
+	.endif
+
+	@ If rreg = r11 or r12, save up to register r12
+	.ifeqs "\rreg", "r11"
+	.set	_RRegList, 12
+	.exitm
+	.endif
+	.ifeqs "\rreg", "r12"
+	.set	_RRegList, 12
+	.exitm
+	.endif
+
+	.warning "Unrecognized saved r register limit: \rreg"
+	.endm
+
+	@ Map the M_START dreg argument ("" or d8..d15) to _DRegList, like for R registers.
+	.macro	_M_GETDREGLIST dreg
+	.ifeqs "\dreg", ""
+	.set	_DRegList, 0
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d8"
+	.set	_DRegList, 8
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d9"
+	.set	_DRegList, 9
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d10"
+	.set	_DRegList, 10
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d11"
+	.set	_DRegList, 11
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d12"
+	.set	_DRegList, 12
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d13"
+	.set	_DRegList, 13
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d14"
+	.set	_DRegList, 14
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d15"
+	.set	_DRegList, 15
+	.exitm
+	.endif
+
+	.warning "Unrecognized saved d register limit: \dreg"
+	.endm
+
+@//////////////////////////////////////////////////////////
+@// Function header and footer macros
+@//////////////////////////////////////////////////////////      
+	
+        @ Function Header Macro
+        @ Generates the function prologue
+        @ Note that functions should all be "stack-moves-once"
+        @ The M_START and M_END macros should be the only places
+        @ where the stack moves.
+        @
+        @  name  = function name
+        @  rreg  = ""   don't stack any registers
+        @          "lr" stack registers "r4,lr"
+        @          "rN" stack registers "r4-rN,lr" (r5/r7/r9/r11 also stack rN+1)
+        @  dreg  = ""   don't stack any D registers
+        @          "dN" stack registers "d8-dN"
+        @
+        @ Note: ARM Architecture procedure call standard AAPCS
+        @ states that r4-r11, sp, d8-d15 must be preserved by
+        @ a compliant function.
+	.macro	M_START name, rreg, dreg
+	.set	_Workspace, 0
+
+	@ Define the function and make it external.
+	.global	\name
+	.func	\name
+	.section	.text.\name,"ax",%progbits
+	.align	2
+\name :		
+.fnstart
+	@ Save specified R registers
+	_M_GETRREGLIST	\rreg
+	_M_PUSH_RREG
+
+	@ Save specified D registers
+        _M_GETDREGLIST  \dreg
+	_M_PUSH_DREG
+
+	@ Round the scratch size up to a multiple of 8 bytes, then reserve it
+	.if (_SBytes & 7) != 0
+	.set	_SBytes, _SBytes + (8 - (_SBytes & 7))
+	.endif
+	.if _SBytes != 0
+		sub	sp, sp, #_SBytes
+	.endif	
+	.endm
+
+        @ Function Footer Macro
+        @ Generates the function epilogue
+	.macro M_END
+	@ Release the scratch area that M_START subtracted from sp
+	.if _SBytes != 0
+		add	sp, sp, #_SBytes
+	.endif
+	@ Restore any saved R or D registers and return.
+	_M_RET
+	.fnend	
+	.endfunc
+        @ Reset the scratch byte count to its initial value so the next
+	@ function starts with no stack allocation.
+	.set _SBytes, 0
+	.endm
+
+	@// Emit the vpush that saves d8 through the D register numbered by
+	@// _DRegList (8..15).  Any other value, including 0, emits nothing.
+	.macro _M_PUSH_DREG
+	.if _DRegList == 8
+		vpush	{d8}
+	.exitm
+	.endif
+	
+	.if _DRegList == 9
+		vpush	{d8-d9}
+	.exitm
+	.endif
+	
+	.if _DRegList == 10
+		vpush	{d8-d10}
+	.exitm
+	.endif
+	
+	.if _DRegList == 11
+		vpush	{d8-d11}
+	.exitm
+	.endif
+	
+	.if _DRegList == 12
+		vpush	{d8-d12}
+	.exitm
+	.endif
+	
+	.if _DRegList == 13
+		vpush	{d8-d13}
+	.exitm
+	.endif
+	
+	.if _DRegList == 14
+		vpush	{d8-d14}
+	.exitm
+	.endif
+	
+	.if _DRegList == 15
+		vpush	{d8-d15}
+	.exitm
+	.endif
+	.endm
+
+	@// Emit the stmfd that saves r4 through the R register numbered by _RRegList,
+	@// plus lr, for _RRegList in {4,6,8,10,12}.  Any other value, including 0, emits nothing.
+	.macro _M_PUSH_RREG
+	.if _RRegList == 4
+		stmfd	sp!, {r4, lr}
+	.exitm
+	.endif
+	
+	.if _RRegList == 6
+		stmfd	sp!, {r4-r6, lr}
+	.exitm
+	.endif
+	
+	.if _RRegList == 8
+		stmfd	sp!, {r4-r8, lr}
+	.exitm
+	.endif
+	
+	.if _RRegList == 10
+		stmfd	sp!, {r4-r10, lr}
+	.exitm
+	.endif
+	
+	.if _RRegList == 12
+		stmfd	sp!, {r4-r12, lr}
+	.exitm
+	.endif
+	.endm
+
+	@// The opposite of _M_PUSH_DREG: vpop d8 through the register numbered by _DRegList (takes no arguments)
+	.macro  _M_POP_DREG
+	.if _DRegList == 8
+		vpop	{d8}
+	.exitm
+	.endif
+	
+	.if _DRegList == 9
+		vpop	{d8-d9}
+	.exitm
+	.endif
+	
+	.if _DRegList == 10
+		vpop	{d8-d10}
+	.exitm
+	.endif
+	
+	.if _DRegList == 11
+		vpop	{d8-d11}
+	.exitm
+	.endif
+	
+	.if _DRegList == 12
+		vpop	{d8-d12}
+	.exitm
+	.endif
+	
+	.if _DRegList == 13
+		vpop	{d8-d13}
+	.exitm
+	.endif
+	
+	.if _DRegList == 14
+		vpop	{d8-d14}
+	.exitm
+	.endif
+	
+	.if _DRegList == 15
+		vpop	{d8-d15}
+	.exitm
+	.endif
+	.endm
+
+	@// The opposite of _M_PUSH_RREG, and the return: loads the saved lr into pc (bx lr if nothing was saved); cc = optional condition
+	.macro _M_POP_RREG cc
+	.if _RRegList == 0
+		bx\cc lr
+	.exitm
+	.endif
+	.if _RRegList == 4
+		ldm\cc\()fd	sp!, {r4, pc}
+	.exitm
+	.endif
+	
+	.if _RRegList == 6
+		ldm\cc\()fd	sp!, {r4-r6, pc}
+	.exitm
+	.endif
+	
+	.if _RRegList == 8
+		ldm\cc\()fd	sp!, {r4-r8, pc}
+	.exitm
+	.endif
+	
+	.if _RRegList == 10
+		ldm\cc\()fd	sp!, {r4-r10, pc}
+	.exitm
+	.endif
+	
+	.if _RRegList == 12
+		ldm\cc\()fd	sp!, {r4-r12, pc}
+	.exitm
+	.endif
+	.endm
+	
+        @ Produce function return instructions: restore D registers, then restore R registers and return
+	.macro	_M_RET cc
+	_M_POP_DREG \cc	@ NOTE(review): _M_POP_DREG declares no parameters and pops unconditionally - confirm before passing a cc
+	_M_POP_RREG \cc
+	.endm	
+	
+        @// Reserve |size| bytes of stack scratch space at a 4-byte aligned
+        @// offset and define the symbol |name|_F as that offset.
+	.macro	M_ALLOC4 name, size
+	.if	(_SBytes & 3) != 0
+	.set	_SBytes, _SBytes + (4 - (_SBytes & 3))
+	.endif
+	.set	\name\()_F, _SBytes
+	.set	_SBytes, _SBytes + \size
+	
+	.endm
+
+        @ Load register r from the stack slot named a0 (offset symbol a0_F); a1 = optional condition
+	.macro M_LDR r, a0, a1, a2, a3
+	_M_DATA "ldr", 4, \r, \a0, \a1, \a2, \a3
+	.endm
+
+        @ Store register r to the stack slot named a0 (offset symbol a0_F); a1 = optional condition
+	.macro M_STR r, a0, a1, a2, a3
+	_M_DATA "str", 4, \r, \a0, \a1, \a2, \a3
+	.endm
+
+        @ Macro to perform a data access operation
+        @ Such as LDR or STR
+        @ The addressing mode is modified such that
+        @ 1. If no address is given then the name is taken
+        @    as a stack offset
+        @ 2. If the addressing mode is not available for the
+        @    state being assembled for (eg Thumb) then a suitable
+        @    addressing mode is substituted.
+        @
+        @ On Entry:
+        @ $i = Instruction to perform (eg "LDRB")
+        @ $a = Required byte alignment
+        @ $r = Register(s) to transfer (eg "r1")
+        @ $a0,$a1,$a2. Addressing mode and condition. One of:
+        @     label {,cc}
+        @     [base]                    {,,,cc}
+        @     [base, offset]{!}         {,,cc}
+        @     [base], offset            {,,cc}
+        @     [base, offset, shift]{!}  {,cc}
+        @     [base], offset, shift     {,cc}
+	@ WARNING: only the first form is implemented: a0 names a stack slot (a0_F)
+	@ and a1 is the optional condition; a (alignment), a2 and a3 are ignored.
+	.macro _M_DATA i, a, r, a0, a1, a2, a3
+	.set	_Offset, _Workspace + \a0\()_F
+	\i\a1	\r, [sp, #_Offset]	
+	.endm
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/api/armOMX.h
@@ -0,0 +1,289 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/* 
+ * 
+ * File Name:  armOMX_ReleaseVersion.h
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   15322
+ * Last Modified Date:       Wed, 15 Oct 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * This file allows a version of the OMX DL libraries to be built where some or
+ * all of the function names can be given a user specified suffix. 
+ *
+ * You might want to use it where:
+ *
+ * - you want to rename a function "out of the way" so that you could replace
+ *   a function with a different version (the original version would still be
+ *   in the library just with a different name - so you could debug the new
+ *   version by comparing it to the output of the old)
+ *
+ * - you want to rename all the functions to versions with a suffix so that 
+ *   you can include two versions of the library and choose between functions
+ *   at runtime.
+ *
+ *     e.g. omxIPBM_Copy_U8_C1R could be renamed omxIPBM_Copy_U8_C1R_CortexA8
+ * 
+ */
+
+  
+#ifndef _armOMX_H_
+#define _armOMX_H_
+
+#define ARMOMX_ENABLE_RENAMING 0
+#if ARMOMX_ENABLE_RENAMING
+
+/* We need to define these two macros in order to expand and concatenate the names */
+#define OMXCAT2BAR(A, B) omx ## A ## B
+#define OMXCATBAR(A, B) OMXCAT2BAR(A, B)
+
+/* Define the suffix to add to all functions - the default is no suffix */
+#define BARE_SUFFIX 
+
+
+
+/* Define what happens to the bare suffix-less functions, down to the sub-domain accuracy */
+#define OMXACAAC_SUFFIX    BARE_SUFFIX   
+#define OMXACMP3_SUFFIX    BARE_SUFFIX
+#define OMXICJP_SUFFIX     BARE_SUFFIX
+#define OMXIPBM_SUFFIX     BARE_SUFFIX
+#define OMXIPCS_SUFFIX     BARE_SUFFIX
+#define OMXIPPP_SUFFIX     BARE_SUFFIX
+#define OMXSP_SUFFIX       BARE_SUFFIX
+#define OMXVCCOMM_SUFFIX   BARE_SUFFIX
+#define OMXVCM4P10_SUFFIX  BARE_SUFFIX
+#define OMXVCM4P2_SUFFIX   BARE_SUFFIX
+
+
+
+
+/* Define what each bare, un-suffixed OpenMAX API function name is to be renamed to */
+#define omxACAAC_DecodeChanPairElt                        OMXCATBAR(ACAAC_DecodeChanPairElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeDatStrElt                          OMXCATBAR(ACAAC_DecodeDatStrElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeFillElt                            OMXCATBAR(ACAAC_DecodeFillElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeIsStereo_S32                       OMXCATBAR(ACAAC_DecodeIsStereo_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsPNS_S32_I                        OMXCATBAR(ACAAC_DecodeMsPNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsStereo_S32_I                     OMXCATBAR(ACAAC_DecodeMsStereo_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodePrgCfgElt                          OMXCATBAR(ACAAC_DecodePrgCfgElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeTNS_S32_I                          OMXCATBAR(ACAAC_DecodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DeinterleaveSpectrum_S32                 OMXCATBAR(ACAAC_DeinterleaveSpectrum_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_EncodeTNS_S32_I                          OMXCATBAR(ACAAC_EncodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermPredict_S32                      OMXCATBAR(ACAAC_LongTermPredict_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermReconstruct_S32_I                OMXCATBAR(ACAAC_LongTermReconstruct_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTFwd_S32                              OMXCATBAR(ACAAC_MDCTFwd_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTInv_S32_S16                          OMXCATBAR(ACAAC_MDCTInv_S32_S16, OMXACAAC_SUFFIX)
+#define omxACAAC_NoiselessDecode                          OMXCATBAR(ACAAC_NoiselessDecode, OMXACAAC_SUFFIX)
+#define omxACAAC_QuantInv_S32_I                           OMXCATBAR(ACAAC_QuantInv_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADIFHeader                         OMXCATBAR(ACAAC_UnpackADIFHeader, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADTSFrameHeader                    OMXCATBAR(ACAAC_UnpackADTSFrameHeader, OMXACAAC_SUFFIX)
+
+
+#define omxACMP3_HuffmanDecode_S32                        OMXCATBAR(ACMP3_HuffmanDecode_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfb_S32                     OMXCATBAR(ACMP3_HuffmanDecodeSfb_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfbMbp_S32                  OMXCATBAR(ACMP3_HuffmanDecodeSfbMbp_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_MDCTInv_S32                              OMXCATBAR(ACMP3_MDCTInv_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantize_S32_I                         OMXCATBAR(ACMP3_ReQuantize_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantizeSfb_S32_I                      OMXCATBAR(ACMP3_ReQuantizeSfb_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_SynthPQMF_S32_S16                        OMXCATBAR(ACMP3_SynthPQMF_S32_S16, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackFrameHeader                        OMXCATBAR(ACMP3_UnpackFrameHeader, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackScaleFactors_S8                    OMXCATBAR(ACMP3_UnpackScaleFactors_S8, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackSideInfo                           OMXCATBAR(ACMP3_UnpackSideInfo, OMXACMP3_SUFFIX)
+
+#define omxICJP_CopyExpand_U8_C3                          OMXCATBAR(ICJP_CopyExpand_U8_C3, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16                                OMXCATBAR(ICJP_DCTFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16_I                              OMXCATBAR(ICJP_DCTFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16                                OMXCATBAR(ICJP_DCTInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16_I                              OMXCATBAR(ICJP_DCTInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_Multiple_S16                  OMXCATBAR(ICJP_DCTQuantFwd_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16                           OMXCATBAR(ICJP_DCTQuantFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16_I                         OMXCATBAR(ICJP_DCTQuantFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwdTableInit                      OMXCATBAR(ICJP_DCTQuantFwdTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_Multiple_S16                  OMXCATBAR(ICJP_DCTQuantInv_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16                           OMXCATBAR(ICJP_DCTQuantInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16_I                         OMXCATBAR(ICJP_DCTQuantInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInvTableInit                      OMXCATBAR(ICJP_DCTQuantInvTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffman8x8_Direct_S16_C1            OMXCATBAR(ICJP_DecodeHuffman8x8_Direct_S16_C1, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecGetBufSize_U8            OMXCATBAR(ICJP_DecodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecInit_U8                  OMXCATBAR(ICJP_DecodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffman8x8_Direct_S16_U1_C1         OMXCATBAR(ICJP_EncodeHuffman8x8_Direct_S16_U1_C1, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecGetBufSize_U8            OMXCATBAR(ICJP_EncodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecInit_U8                  OMXCATBAR(ICJP_EncodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+
+#define omxIPBM_AddC_U8_C1R_Sfs                           OMXCATBAR(IPBM_AddC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C1R                               OMXCATBAR(IPBM_Copy_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C3R                               OMXCATBAR(IPBM_Copy_U8_C3R, OMXIPBM_SUFFIX)
+#define omxIPBM_Mirror_U8_C1R                             OMXCATBAR(IPBM_Mirror_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_MulC_U8_C1R_Sfs                           OMXCATBAR(IPBM_MulC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+
+#define omxIPCS_ColorTwistQ14_U8_C3R                      OMXCATBAR(IPCS_ColorTwistQ14_U8_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszCscRotBGR_U8_P3C3R             OMXCATBAR(IPCS_YCbCr420RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszRot_U8_P3R                     OMXCATBAR(IPCS_YCbCr420RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565_U8_U16_P3C3R             OMXCATBAR(IPCS_YCbCr420ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszCscRotBGR_U8_P3C3R             OMXCATBAR(IPCS_YCbCr422RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R          OMXCATBAR(IPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszRot_U8_P3R                     OMXCATBAR(IPCS_YCbCr422RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR565_U8_U16_C2C3R            OMXCATBAR(IPCS_YCbYCr422ToBGR565_U8_U16_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR888_U8_C2C3R                OMXCATBAR(IPCS_YCbYCr422ToBGR888_U8_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R        OMXCATBAR(IPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToYCbCr420Rotate_U8_P3R           OMXCATBAR(IPCS_YCbCr422ToYCbCr420Rotate_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_C3R               OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_P3C3R             OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR888_U8_C3R                   OMXCATBAR(IPCS_YCbCr444ToBGR888_U8_C3R, OMXIPCS_SUFFIX)
+
+#define omxIPPP_Deblock_HorEdge_U8_I                      OMXCATBAR(IPPP_Deblock_HorEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_Deblock_VerEdge_U8_I                      OMXCATBAR(IPPP_Deblock_VerEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterFIR_U8_C1R                          OMXCATBAR(IPPP_FilterFIR_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterMedian_U8_C1R                       OMXCATBAR(IPPP_FilterMedian_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_GetCentralMoment_S64                      OMXCATBAR(IPPP_GetCentralMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_GetSpatialMoment_S64                      OMXCATBAR(IPPP_GetSpatialMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentGetStateSize                        OMXCATBAR(IPPP_MomentGetStateSize, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentInit                                OMXCATBAR(IPPP_MomentInit, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C1R                            OMXCATBAR(IPPP_Moments_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C3R                            OMXCATBAR(IPPP_Moments_U8_C3R, OMXIPPP_SUFFIX)
+
+#define omxSP_BlockExp_S16                                OMXCATBAR(SP_BlockExp_S16, OMXSP_SUFFIX)
+#define omxSP_BlockExp_S32                                OMXCATBAR(SP_BlockExp_S32, OMXSP_SUFFIX)
+#define omxSP_Copy_S16                                    OMXCATBAR(SP_Copy_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16                                 OMXCATBAR(SP_DotProd_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16_Sfs                             OMXCATBAR(SP_DotProd_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC16_Sfs                        OMXCATBAR(SP_FFTFwd_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC32_Sfs                        OMXCATBAR(SP_FFTFwd_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S16S32_Sfs                    OMXCATBAR(SP_FFTFwd_RToCCS_S16S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S32_Sfs                       OMXCATBAR(SP_FFTFwd_RToCCS_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC16                        OMXCATBAR(SP_FFTGetBufSize_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC32                        OMXCATBAR(SP_FFTGetBufSize_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S16S32                      OMXCATBAR(SP_FFTGetBufSize_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S32                         OMXCATBAR(SP_FFTGetBufSize_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC16                              OMXCATBAR(SP_FFTInit_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC32                              OMXCATBAR(SP_FFTInit_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S16S32                            OMXCATBAR(SP_FFTInit_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S32                               OMXCATBAR(SP_FFTInit_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32_Sfs                       OMXCATBAR(SP_FFTInv_CCSToR_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32S16_Sfs                    OMXCATBAR(SP_FFTInv_CCSToR_S32S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC16_Sfs                        OMXCATBAR(SP_FFTInv_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC32_Sfs                        OMXCATBAR(SP_FFTInv_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32                            OMXCATBAR(SP_FilterMedian_S32, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32_I                          OMXCATBAR(SP_FilterMedian_S32_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16                              OMXCATBAR(SP_FIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_I                            OMXCATBAR(SP_FIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_ISfs                         OMXCATBAR(SP_FIR_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_Sfs                          OMXCATBAR(SP_FIR_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16                           OMXCATBAR(SP_FIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_I                         OMXCATBAR(SP_FIROne_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_ISfs                      OMXCATBAR(SP_FIROne_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_Sfs                       OMXCATBAR(SP_FIROne_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16                        OMXCATBAR(SP_IIR_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16_I                      OMXCATBAR(SP_IIR_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16                              OMXCATBAR(SP_IIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16_I                            OMXCATBAR(SP_IIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16                     OMXCATBAR(SP_IIROne_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16_I                   OMXCATBAR(SP_IIROne_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16                           OMXCATBAR(SP_IIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16_I                         OMXCATBAR(SP_IIROne_Direct_S16_I, OMXSP_SUFFIX)
+
+#define omxVCCOMM_Average_16x                             OMXCATBAR(VCCOMM_Average_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Average_8x                              OMXCATBAR(VCCOMM_Average_8x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock                OMXCATBAR(VCCOMM_ComputeTextureErrorBlock, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock_SAD            OMXCATBAR(VCCOMM_ComputeTextureErrorBlock_SAD, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy16x16                               OMXCATBAR(VCCOMM_Copy16x16, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy8x8                                 OMXCATBAR(VCCOMM_Copy8x8, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ExpandFrame_I                           OMXCATBAR(VCCOMM_ExpandFrame_I, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_LimitMVToRect                           OMXCATBAR(VCCOMM_LimitMVToRect, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_16x                                 OMXCATBAR(VCCOMM_SAD_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_8x                                  OMXCATBAR(VCCOMM_SAD_8x, OMXVCCOMM_SUFFIX)
+
+#define omxVCM4P10_Average_4x                             OMXCATBAR(VCM4P10_Average_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Half                        OMXCATBAR(VCM4P10_BlockMatch_Half, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Integer                     OMXCATBAR(VCM4P10_BlockMatch_Integer, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Quarter                     OMXCATBAR(VCM4P10_BlockMatch_Quarter, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockChroma_I                        OMXCATBAR(VCM4P10_DeblockChroma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockLuma_I                          OMXCATBAR(VCM4P10_DeblockLuma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC        OMXCATBAR(VCM4P10_DecodeChromaDcCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeCoeffsToPairCAVLC                OMXCATBAR(VCM4P10_DecodeCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DequantTransformResidualFromPairAndAdd OMXCATBAR(VCM4P10_DequantTransformResidualFromPairAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_HorEdge_I       OMXCATBAR(VCM4P10_FilterDeblockingChroma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_VerEdge_I       OMXCATBAR(VCM4P10_FilterDeblockingChroma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_HorEdge_I         OMXCATBAR(VCM4P10_FilterDeblockingLuma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_VerEdge_I         OMXCATBAR(VCM4P10_FilterDeblockingLuma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_GetVLCInfo                             OMXCATBAR(VCM4P10_GetVLCInfo, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateChroma                      OMXCATBAR(VCM4P10_InterpolateChroma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfHor_Luma                OMXCATBAR(VCM4P10_InterpolateHalfHor_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfVer_Luma                OMXCATBAR(VCM4P10_InterpolateHalfVer_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateLuma                        OMXCATBAR(VCM4P10_InterpolateLuma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_ChromaDC           OMXCATBAR(VCM4P10_InvTransformDequant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_LumaDC             OMXCATBAR(VCM4P10_InvTransformDequant_LumaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformResidualAndAdd             OMXCATBAR(VCM4P10_InvTransformResidualAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEGetBufSize                           OMXCATBAR(VCM4P10_MEGetBufSize, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEInit                                 OMXCATBAR(VCM4P10_MEInit, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MotionEstimationMB                     OMXCATBAR(VCM4P10_MotionEstimationMB, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_16x16                     OMXCATBAR(VCM4P10_PredictIntra_16x16, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_4x4                       OMXCATBAR(VCM4P10_PredictIntra_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntraChroma_8x8                  OMXCATBAR(VCM4P10_PredictIntraChroma_8x8, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SAD_4x                                 OMXCATBAR(VCM4P10_SAD_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_16x                            OMXCATBAR(VCM4P10_SADQuar_16x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_4x                             OMXCATBAR(VCM4P10_SADQuar_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_8x                             OMXCATBAR(VCM4P10_SADQuar_8x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SATD_4x4                               OMXCATBAR(VCM4P10_SATD_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SubAndTransformQDQResidual             OMXCATBAR(VCM4P10_SubAndTransformQDQResidual, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantChromaDCFromPair       OMXCATBAR(VCM4P10_TransformDequantChromaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantLumaDCFromPair         OMXCATBAR(VCM4P10_TransformDequantLumaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_ChromaDC                OMXCATBAR(VCM4P10_TransformQuant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_LumaDC                  OMXCATBAR(VCM4P10_TransformQuant_LumaDC, OMXVCM4P10_SUFFIX)
+
+#define omxVCM4P2_BlockMatch_Half_16x16                   OMXCATBAR(VCM4P2_BlockMatch_Half_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Half_8x8                     OMXCATBAR(VCM4P2_BlockMatch_Half_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_16x16                OMXCATBAR(VCM4P2_BlockMatch_Integer_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_8x8                  OMXCATBAR(VCM4P2_BlockMatch_Integer_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DCT8x8blk                               OMXCATBAR(VCM4P2_DCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Inter                   OMXCATBAR(VCM4P2_DecodeBlockCoef_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Intra                   OMXCATBAR(VCM4P2_DecodeBlockCoef_Intra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodePadMV_PVOP                        OMXCATBAR(VCM4P2_DecodePadMV_PVOP, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_Inter                   OMXCATBAR(VCM4P2_DecodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraACVLC              OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraDCVLC              OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeMV                                OMXCATBAR(VCM4P2_EncodeMV, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_Inter                   OMXCATBAR(VCM4P2_EncodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraACVLC              OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraDCVLC              OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_FindMVpred                              OMXCATBAR(VCM4P2_FindMVpred, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_IDCT8x8blk                              OMXCATBAR(VCM4P2_IDCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MCReconBlock                            OMXCATBAR(VCM4P2_MCReconBlock, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEGetBufSize                            OMXCATBAR(VCM4P2_MEGetBufSize, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEInit                                  OMXCATBAR(VCM4P2_MEInit, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MotionEstimationMB                      OMXCATBAR(VCM4P2_MotionEstimationMB, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_PredictReconCoefIntra                   OMXCATBAR(VCM4P2_PredictReconCoefIntra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInter_I                            OMXCATBAR(VCM4P2_QuantInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantIntra_I                            OMXCATBAR(VCM4P2_QuantIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvInter_I                         OMXCATBAR(VCM4P2_QuantInvInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvIntra_I                         OMXCATBAR(VCM4P2_QuantInvIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_inter                 OMXCATBAR(VCM4P2_TransRecBlockCoef_inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_intra                 OMXCATBAR(VCM4P2_TransRecBlockCoef_intra, OMXVCM4P2_SUFFIX)
+
+#endif /* endif ARMOMX_ENABLE_RENAMING */
+#endif /* _armOMX_h_ */
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/api/omxtypes.h
@@ -0,0 +1,254 @@
+/**
+ * File: omxtypes.h
+ * Brief: Defines basic Data types used in OpenMAX v1.0.2 header files.
+ *
+ * Copyright (c) 2005-2008,2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ *
+ */
+  
+#ifndef _OMXTYPES_H_
+#define _OMXTYPES_H_
+
+#include <limits.h> 
+
+#define OMX_IN
+#define OMX_OUT
+#define OMX_INOUT
+
+
+/**
+ * OMXResult: return value or error value returned from a function.
+ * Identical to OMX_INT (the trailing INT_MAX entry forces that size).
+ */
+typedef enum
+{
+    /* ---- Mandatory return codes: use cases are described for each function ---- */
+    OMX_Sts_NoErr = 0,                        /* Success: the function completed without error */
+    OMX_Sts_Err = -2,                         /* Unknown or unspecified error */
+    OMX_Sts_InvalidBitstreamValErr = -182,    /* Invalid value detected during bitstream processing */
+    OMX_Sts_MemAllocErr = -9,                 /* Not enough memory allocated for the operation */
+    OMX_StsACAAC_GainCtrErr = -159,           /* AAC: unsupported gain control data detected */
+    OMX_StsACAAC_PrgNumErr = -167,            /* AAC: invalid number of elements for one program */
+    OMX_StsACAAC_CoefValErr = -163,           /* AAC: invalid quantized coefficient value */
+    OMX_StsACAAC_MaxSfbErr = -162,            /* AAC: invalid maxSfb value in relation to numSwb */
+    OMX_StsACAAC_PlsDataErr = -160,           /* AAC: pulse escape sequence data error */
+
+    /* ---- Optional return codes: use cases are described for each function ---- */
+    OMX_Sts_BadArgErr = -5,                   /* Bad arguments */
+
+    OMX_StsACAAC_TnsNumFiltErr = -157,        /* AAC: invalid number of TNS filters */
+    OMX_StsACAAC_TnsLenErr = -156,            /* AAC: invalid TNS region length */
+    OMX_StsACAAC_TnsOrderErr = -155,          /* AAC: invalid order of TNS filter */
+    OMX_StsACAAC_TnsCoefResErr = -154,        /* AAC: invalid bit-resolution for TNS filter coefficients */
+    OMX_StsACAAC_TnsCoefErr = -153,           /* AAC: invalid TNS filter coefficients */
+    OMX_StsACAAC_TnsDirectErr = -152,         /* AAC: invalid TNS filter direction */
+
+    OMX_StsICJP_JPEGMarkerErr = -183,         /* JPEG marker encountered within an entropy-coded
+                                                 block; Huffman decoding terminated early */
+    OMX_StsICJP_JPEGMarker = -181,            /* JPEG marker encountered; Huffman decoding
+                                                 terminated early */
+    OMX_StsIPPP_ContextMatchErr = -17,        /* Context parameter does not match the operation */
+
+    OMX_StsSP_EvenMedianMaskSizeErr = -180,   /* Even median-filter mask size was replaced by an odd one */
+
+    OMX_Sts_MaximumEnumeration = INT_MAX      /* Placeholder, forces enum of size OMX_INT */
+
+} OMXResult;
+
+ 
+/* OMX_U8: unsigned integer type whose maximum value is 0xff (8 bits). */
+#if (UCHAR_MAX == 0xff)
+typedef unsigned char OMX_U8;
+#elif (USHRT_MAX == 0xff)
+typedef unsigned short OMX_U8;
+#else
+#error OMX_U8 undefined
+#endif
+
+ 
+/* OMX_S8: signed integer type whose maximum value is 0x7f (8 bits). */
+#if (SCHAR_MAX == 0x7f)
+typedef signed char OMX_S8;
+#elif (SHRT_MAX == 0x7f)
+typedef signed short OMX_S8;
+#else
+#error OMX_S8 undefined
+#endif
+ 
+ 
+/* OMX_U16: unsigned integer type whose maximum value is 0xffff (16 bits). */
+#if (USHRT_MAX == 0xffff)
+typedef unsigned short OMX_U16;
+#elif (UINT_MAX == 0xffff)
+typedef unsigned OMX_U16;
+#else
+#error OMX_U16 undefined
+#endif
+
+
+/* OMX_S16: signed integer type whose maximum value is 0x7fff (16 bits). */
+#if (SHRT_MAX == 0x7fff)
+typedef short OMX_S16;
+#elif (INT_MAX == 0x7fff)
+typedef int OMX_S16;
+#else
+#error OMX_S16 undefined
+#endif
+
+
+/* OMX_U32: unsigned integer type whose maximum value is 0xffffffff (32 bits).
+ * The fallback branch selects unsigned long, so it must be guarded by
+ * ULONG_MAX: LONG_MAX is the limit of *signed* long and is 0x7fffffff when
+ * long is 32 bits wide, so a LONG_MAX test could never select this branch. */
+#if UINT_MAX == 0xffffffff
+typedef unsigned int OMX_U32;
+#elif ULONG_MAX == 0xffffffff
+typedef unsigned long int OMX_U32; 
+#else
+#error OMX_U32 undefined
+#endif
+
+
+/* OMX_S32: signed integer type whose maximum value is 0x7fffffff (32 bits). */
+#if (INT_MAX == 0x7fffffff)
+typedef int OMX_S32;
+#elif (LONG_MAX == 0x7fffffff)
+typedef long OMX_S32;
+#else
+#error OMX_S32 undefined
+#endif
+
+
+/* OMX_U64 & OMX_S64 */
+#if defined( _WIN32 ) || defined ( _WIN64 )
+    typedef __int64 OMX_S64; /** Signed 64-bit integer */
+    typedef unsigned __int64 OMX_U64; /** Unsigned 64-bit integer */
+    #define OMX_MIN_S64			(0x8000000000000000i64)
+    #define OMX_MIN_U64			(0x0000000000000000i64)
+    #define OMX_MAX_S64			(0x7FFFFFFFFFFFFFFFi64)
+    #define OMX_MAX_U64			(0xFFFFFFFFFFFFFFFFi64)
+#else
+    typedef long long OMX_S64; /** Signed 64-bit integer */
+    typedef unsigned long long OMX_U64; /** Unsigned 64-bit integer */
+    #define OMX_MIN_S64			(0x8000000000000000LL)
+    #define OMX_MIN_U64			(0x0000000000000000LL)
+    #define OMX_MAX_S64			(0x7FFFFFFFFFFFFFFFLL)
+    #define OMX_MAX_U64			(0xFFFFFFFFFFFFFFFFLL)
+#endif
+
+
+/* OMX_SC8: signed 8-bit complex number */
+typedef struct {
+    OMX_S8 Re;  /* real part */
+    OMX_S8 Im;  /* imaginary part */
+} OMX_SC8;
+
+
+/* OMX_SC16: signed 16-bit complex number */
+typedef struct {
+    OMX_S16 Re;  /* real part */
+    OMX_S16 Im;  /* imaginary part */
+} OMX_SC16;
+
+
+/* OMX_SC32: signed 32-bit complex number */
+typedef struct {
+    OMX_S32 Re;  /* real part */
+    OMX_S32 Im;  /* imaginary part */
+} OMX_SC32;
+
+
+/* OMX_SC64: signed 64-bit complex number */
+typedef struct {
+    OMX_S64 Re;  /* real part */
+    OMX_S64 Im;  /* imaginary part */
+} OMX_SC64;
+
+
+/* OMX_F32 */
+typedef float OMX_F32; /** Single-precision floating point, IEEE 754 */
+
+
+/* OMX_F64 */
+typedef double OMX_F64; /** Double-precision floating point, IEEE 754 */
+
+
+/* OMX_INT */
+typedef int OMX_INT; /** Signed integer corresponding to the machine word length; its maximum value is INT_MAX */
+
+
+/* Value limits of the 8-, 16- and 32-bit OMX integer types, grouped per type. */
+#define OMX_MIN_S8   (-128)
+#define OMX_MAX_S8   (127)
+#define OMX_MIN_U8   0
+#define OMX_MAX_U8   (255)
+
+#define OMX_MIN_S16  (-32768)
+#define OMX_MAX_S16  (32767)
+#define OMX_MIN_U16  0
+#define OMX_MAX_U16  (0xFFFF)
+
+/* Written as (-MAX - 1) because 2147483648 does not fit in a 32-bit int. */
+#define OMX_MIN_S32  (-2147483647-1)
+#define OMX_MAX_S32  (2147483647)
+#define OMX_MIN_U32  0
+#define OMX_MAX_U32  (0xFFFFFFFF)
+
+/* OMXVoid: alias for void. */
+typedef void OMXVoid;
+
+/* Fallback definition of NULL; takes effect only if no earlier header defined it. */
+#ifndef NULL
+#define NULL ((void*)0)
+#endif
+
+/**
+ * OMXRect: geometric position and size of a rectangle.
+ * (x, y) are the coordinates of the top left corner; the rectangle
+ * extends by width in the x-direction and by height in the y-direction.
+ */
+typedef struct
+{
+    OMX_INT x;       /* x-coordinate of the top left corner */
+    OMX_INT y;       /* y-coordinate of the top left corner */
+    OMX_INT width;   /* extent in the x-direction */
+    OMX_INT height;  /* extent in the y-direction */
+} OMXRect;
+
+
+/** OMXPoint: geometric position of a point. */
+typedef struct {
+    OMX_INT x;  /* x-coordinate */
+    OMX_INT y;  /* y-coordinate */
+} OMXPoint;
+
+
+/** OMXSize: dimensions of a rectangle, or of a region of interest in an image. */
+typedef struct {
+    OMX_INT width;   /* extent in the x-direction */
+    OMX_INT height;  /* extent in the y-direction */
+} OMXSize;
+
+#endif /* _OMXTYPES_H_ */
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/api/omxtypes_s.h
@@ -0,0 +1,76 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  omxtypes_s.h
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   9622
+@// Last Modified Date:       Wed, 06 Feb 2008
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@//
+
+@// Mandatory return codes (use cases are described for each function)
+	.equ	OMX_Sts_NoErr,                   0           @// Success: the function completed without error
+	.equ	OMX_Sts_Err,                     -2          @// Unknown or unspecified error
+	.equ	OMX_Sts_InvalidBitstreamValErr,  -182        @// Invalid value detected during bitstream processing
+	.equ	OMX_Sts_MemAllocErr,             -9          @// Not enough memory allocated for the operation
+	.equ	OMX_StsACAAC_GainCtrErr,         -159        @// AAC: unsupported gain control data detected
+	.equ	OMX_StsACAAC_PrgNumErr,          -167        @// AAC: invalid number of elements for one program
+	.equ	OMX_StsACAAC_CoefValErr,         -163        @// AAC: invalid quantized coefficient value
+	.equ	OMX_StsACAAC_MaxSfbErr,          -162        @// AAC: invalid maxSfb value in relation to numSwb
+	.equ	OMX_StsACAAC_PlsDataErr,         -160        @// AAC: pulse escape sequence data error
+
+@// Optional return codes (use cases are described for each function)
+	.equ	OMX_Sts_BadArgErr,               -5          @// Bad arguments
+
+	.equ	OMX_StsACAAC_TnsNumFiltErr,      -157        @// AAC: invalid number of TNS filters
+	.equ	OMX_StsACAAC_TnsLenErr,          -156        @// AAC: invalid TNS region length
+	.equ	OMX_StsACAAC_TnsOrderErr,        -155        @// AAC: invalid order of TNS filter
+	.equ	OMX_StsACAAC_TnsCoefResErr,      -154        @// AAC: invalid bit-resolution for TNS filter coefficients
+	.equ	OMX_StsACAAC_TnsCoefErr,         -153        @// AAC: invalid TNS filter coefficients
+	.equ	OMX_StsACAAC_TnsDirectErr,       -152        @// AAC: invalid TNS filter direction
+	.equ	OMX_StsICJP_JPEGMarkerErr,       -183        @// JPEG marker encountered within an entropy-coded block;
+	                                                     @// Huffman decoding operation terminated early
+	.equ	OMX_StsICJP_JPEGMarker,          -181        @// JPEG marker encountered; Huffman decoding
+	                                                     @// operation terminated early
+	.equ	OMX_StsIPPP_ContextMatchErr,     -17         @// Context parameter does not match the operation
+
+	.equ	OMX_StsSP_EvenMedianMaskSizeErr, -180        @// Even median-filter mask size was replaced by an odd one
+
+	.equ	OMX_Sts_MaximumEnumeration,      0x7FFFFFFF  @// Placeholder: largest positive signed 32-bit value
+
+
+
+	.equ	OMX_MIN_S8,  -128                @// Limits of the 8-bit types
+	.equ	OMX_MAX_S8,  127
+	.equ	OMX_MIN_U8,  0
+	.equ	OMX_MAX_U8,  255
+
+	.equ	OMX_MIN_S16, -32768              @// Limits of the 16-bit types
+	.equ	OMX_MAX_S16, 32767
+	.equ	OMX_MIN_U16, 0
+	.equ	OMX_MAX_U16, 0xFFFF
+
+	.equ	OMX_MIN_S32, (-2147483647 - 1)   @// Limits of the 32-bit types
+	.equ	OMX_MAX_S32, 2147483647
+	.equ	OMX_MIN_U32, 0
+	.equ	OMX_MAX_U32, 0xFFFFFFFF
+
+	@// Flag values used by the PredictIntra functions
+	.equ	OMX_VC_UPPER,       0x01
+	.equ	OMX_VC_LEFT,        0x02
+	.equ	OMX_VC_UPPER_RIGHT, 0x40
+
+	@// Null pointer value for assembler sources
+	.equ	NULL, 0
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/api/armSP.h
@@ -0,0 +1,92 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  armSP.h
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   7014
+ * Last Modified Date:       Wed, 01 Aug 2007
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *   
+ * File: armSP.h
+ * Brief: Declares API's/Basic Data types used across the OpenMAX Signal Processing domain
+ *
+ */
+#ifndef _armSP_H_
+#define _armSP_H_
+
+#include "dl/api/omxtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** FFT Specific declarations */
+/* Tables defined in another translation unit (by name, FFT twiddle factors).
+ * The S32 table is declared with 1026 entries. The F32 table is declared
+ * with no bound (incomplete array type), so sizeof cannot be applied to it
+ * through this header. */
+extern  OMX_S32 armSP_FFT_S32TwiddleTable[1026];
+extern OMX_F32 armSP_FFT_F32TwiddleTable[];
+
+/* FFT spec record using OMX_SC32 twiddle and work buffers.
+ * NOTE(review): N is presumably the transform length and pBitRev a
+ * bit-reversal index table - confirm against the FFT init code. */
+typedef struct ARMsFFTSpec_SC32_Tag {
+    OMX_U32 N;
+    OMX_U16* pBitRev;
+    OMX_SC32* pTwiddle;
+    OMX_SC32* pBuf;
+} ARMsFFTSpec_SC32;
+
+
+/* FFT spec record using OMX_SC16 twiddle and work buffers.
+ * NOTE(review): N is presumably the transform length and pBitRev a
+ * bit-reversal index table - confirm against the FFT init code. */
+typedef struct ARMsFFTSpec_SC16_Tag {
+    OMX_U32 N;
+    OMX_U16* pBitRev;
+    OMX_SC16* pTwiddle;
+    OMX_SC16* pBuf;
+} ARMsFFTSpec_SC16;
+
+/* FFT spec record with OMX_SC32 twiddles and an OMX_S32 (non-complex) buffer.
+ * NOTE(review): N is presumably the transform length and pBitRev a
+ * bit-reversal index table - confirm against the FFT init code. */
+typedef struct ARMsFFTSpec_R_SC32_Tag {
+    OMX_U32 N;
+    OMX_U16* pBitRev;
+    OMX_SC32* pTwiddle;
+    OMX_S32* pBuf;
+} ARMsFFTSpec_R_SC32;
+
+/* FFT spec record with OMX_FC32 twiddles and an OMX_F32 (non-complex) buffer.
+ * NOTE(review): OMX_FC32 is not declared in the part of omxtypes.h visible
+ * in this change - confirm where it comes from. */
+typedef struct ARMsFFTSpec_R_FC32_Tag {
+    OMX_U32   N;
+    OMX_U16  *pBitRev;
+    OMX_FC32 *pTwiddle;
+    OMX_F32  *pBuf;
+} ARMsFFTSpec_R_FC32;
+
+/* FFT spec record using OMX_FC32 twiddle and work buffers.
+ * NOTE(review): OMX_FC32 is not declared in the part of omxtypes.h visible
+ * in this change - confirm where it comes from. */
+typedef struct ARMsFFTSpec_FC32_Tag {
+    OMX_U32   N;
+    OMX_U16  *pBitRev;
+    OMX_FC32 *pTwiddle;
+    OMX_FC32 *pBuf;
+} ARMsFFTSpec_FC32;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+/*End of File*/
+
+
+
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/api/omxSP.h
@@ -0,0 +1,2031 @@
+/**
+ * File: omxSP.h
+ * Brief: OpenMAX DL v1.0.2 - Signal Processing library
+ *
+ * Copyright (c) 2005-2008,2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ *
+ */
+
+/* *****************************************************************************************/
+
+#ifndef _OMXSP_H_
+#define _OMXSP_H_
+
+#include "omxtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* 2.1 Vendor-Specific FFT Data Structures
+ *
+ * Each spec type is an alias of void, so this header exposes no layout:
+ * code using these names can only hold pointers to them. */
+ typedef void OMXFFTSpec_C_SC16;
+ typedef void OMXFFTSpec_C_SC32;
+ typedef void OMXFFTSpec_R_S16S32;
+ typedef void OMXFFTSpec_R_S32;
+
+
+/**
+ * Function:  omxSP_Copy_S16   (2.2.1.1.1)
+ *
+ * Description:
+ * Copies the len elements of the vector pointed to by pSrc into the len
+ * elements of the vector pointed to by pDst. That is:
+ *     pDst[i] = pSrc[i], for (i=0, 1, ..., len-1)
+ *
+ * Input Arguments:
+ *
+ *   pSrc - pointer to the source vector
+ *   len - number of elements contained in the source and destination vectors
+ *
+ * Output Arguments:
+ *
+ *   pDst - pointer to the destination vector
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments detected; returned if one or more of
+ *              the following is true:
+ *    -   pSrc or pDst is NULL
+ *    -   len < 0
+ *
+ */
+OMXResult omxSP_Copy_S16 (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst,
+    OMX_INT len
+);
+
+
+
+/**
+ * Function:  omxSP_DotProd_S16   (2.2.2.1.1)
+ *
+ * Description:
+ * Calculates the dot product of the two input vectors.  This function does
+ * not perform scaling. The internal accumulator width must be at least 32
+ * bits.  If any of the partially accumulated values exceeds the range of a
+ * signed 32-bit integer then the result is undefined.
+ *
+ * Input Arguments:
+ *
+ *   pSrc1 - pointer to the first input vector; must be aligned on an 8-byte
+ *            boundary.
+ *   pSrc2 - pointer to the second input vector; must be aligned on an 8-byte
+ *            boundary.
+ *   len - length of the vectors in pSrc1 and pSrc2
+ *
+ * Output Arguments:
+ *
+ *   none
+ *
+ * Return Value:
+ *
+ *    The dot product result.
+ *    Note: this function returns the actual result (an OMX_S32) rather than
+ *          the standard OMXResult status code.
+ *
+ */
+OMX_S32 omxSP_DotProd_S16 (
+    const OMX_S16 *pSrc1,
+    const OMX_S16 *pSrc2,
+    OMX_INT len
+);
+
+
+
+/**
+ * Function:  omxSP_DotProd_S16_Sfs   (2.2.2.1.2)
+ *
+ * Description:
+ * Calculates the dot product of the two input signals with output scaling
+ * and saturation, i.e., the result is multiplied by two to the power of the
+ * negated scale factor (result * 2^(-scaleFactor)) prior to return.  The
+ * result is saturated with rounding if the scaling operation produces a
+ * value outside the range of a signed 32-bit integer. Rounding behavior is
+ * defined in section 1.6.7 "Integer Scaling and Rounding Conventions". The
+ * internal accumulator width must be at least 32 bits. The result is
+ * undefined if any of the partially accumulated values exceeds the range of
+ * a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pSrc1 - pointer to the first input vector; must be aligned on an 8-byte
+ *            boundary.
+ *   pSrc2 - pointer to the second input vector; must be aligned on an 8-byte
+ *            boundary.
+ *   len - length of the vectors in pSrc1 and pSrc2
+ *   scaleFactor - integer scale factor
+ *
+ * Output Arguments:
+ *
+ *   none
+ *
+ * Return Value:
+ *
+ *    The dot product result.
+ *    Note: this function returns the actual result (an OMX_S32) rather than
+ *          the standard OMXResult status code.
+ *
+ */
+OMX_S32 omxSP_DotProd_S16_Sfs (
+    const OMX_S16 *pSrc1,
+    const OMX_S16 *pSrc2,
+    OMX_INT len,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_BlockExp_S16   (2.2.2.2.2)
+ *
+ * Description:
+ * Block exponent calculation for 16-bit and 32-bit signals (count leading
+ * sign bits). These functions compute the number of extra sign bits of all
+ * values in the 16-bit and 32-bit input vector pSrc and return the minimum
+ * sign bit count. This is also the maximum shift value that could be used in
+ * scaling the block of data.  The functions BlockExp_S16 and
+ * BlockExp_S32 return the values 15 and 31, respectively, for input vectors in
+ * which all entries are equal to zero.
+ *
+ * Note: These functions differ from other DL functions by returning the
+ *       actual result rather than the standard OMXResult status code.
+ *
+ * Input Arguments:
+ *
+ *   pSrc - pointer to the input vector
+ *   len - number of elements contained in the input vector
+ *         (0 < len < 65536)
+ *
+ * Output Arguments:
+ *
+ *   none
+ *
+ * Return Value:
+ *
+ *    Maximum exponent that may be used in scaling
+ *
+ */
+OMX_INT omxSP_BlockExp_S16 (
+    const OMX_S16 *pSrc,
+    OMX_INT len
+);
+
+
+
+/**
+ * Function:  omxSP_BlockExp_S32   (2.2.2.2.2)
+ *
+ * Description:
+ * Block exponent calculation for 16-bit and 32-bit signals (count leading
+ * sign bits). These functions compute the number of extra sign bits of all
+ * values in the 16-bit and 32-bit input vector pSrc and return the minimum
+ * sign bit count. This is also the maximum shift value that could be used in
+ * scaling the block of data.  The functions BlockExp_S16 and
+ * BlockExp_S32 return the values 15 and 31, respectively, for input vectors in
+ * which all entries are equal to zero.
+ *
+ * Note: These functions differ from other DL functions by returning the
+ *       actual result rather than the standard OMXResult status code.
+ *
+ * Input Arguments:
+ *
+ *   pSrc - pointer to the input vector
+ *   len - number of elements contained in the input vector
+ *         (0 < len < 65536)
+ *
+ * Output Arguments:
+ *
+ *   none
+ *
+ * Return Value:
+ *
+ *    Maximum exponent that may be used in scaling
+ *
+ */
+OMX_INT omxSP_BlockExp_S32 (
+    const OMX_S32 *pSrc,
+    OMX_INT len
+);
+
+
+
+/**
+ * Function:  omxSP_FIR_Direct_S16   (2.2.3.1.1)
+ *
+ * Description:
+ * Block FIR filtering for 16-bit data type.  This function applies the
+ * FIR filter defined by the coefficient vector pTapsQ15 to a vector of
+ * input data.  The result is saturated with rounding if the operation
+ * produces a value outside the range of a signed 16-bit integer.
+ * Rounding behavior is defined in:
+ *     section 1.6.7 "Integer Scaling and Rounding Conventions".
+ * The internal accumulator width must be at least 32 bits.  The result
+ * is undefined if any of the partially accumulated values exceeds the
+ * range of a signed 32-bit integer.
+ *
+ *
+ * Input Arguments:
+ *
+ *   pSrc   - pointer to the vector of input samples to which the
+ *            filter is applied
+ *   sampLen - the number of samples contained in the input and output
+ *            vectors
+ *   pTapsQ15 - pointer to the vector that contains the filter coefficients,
+ *            represented in Q0.15 format (defined in section 1.6.5). Given
+ *            that:
+ *                    -32768 <= pTapsQ15(k) < 32768,
+ *                     0 <= k < tapsLen,
+ *            the range on the actual filter coefficients is -1 <= bK < 1, and
+ *            therefore coefficient normalization may be required during the
+ *            filter design process.
+ *   tapsLen - the number of taps, or, equivalently, the filter order + 1
+ *   pDelayLine - pointer to the (2 * tapsLen)-element filter memory buffer
+ *            (state). The user is responsible for allocation, initialization,
+ *            and de-allocation. The filter memory elements are initialized to
+ *            zero in most applications.
+ *   pDelayLineIndex - pointer to the filter memory index that is maintained
+ *            internally by the function. The user should initialize the value
+ *            of this index to zero.
+ *
+ * Output Arguments:
+ *
+ *   pDst   - pointer to the vector of filtered output samples
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -   One or more of the following pointers is NULL:
+ *          -  pSrc,
+ *          -  pDst,
+ *          -  pSrcDst,
+ *          -  pTapsQ15,
+ *          -  pDelayLine, or
+ *          -  pDelayLineIndex
+ *    -   sampLen < 0
+ *    -   tapsLen < 1
+ *    -   *pDelayLineIndex < 0 or *pDelayLineIndex >= (2 * tapsLen).
+ *
+ */
+OMXResult omxSP_FIR_Direct_S16 (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst,
+    OMX_INT sampLen,
+    const OMX_S16 *pTapsQ15,
+    OMX_INT tapsLen,
+    OMX_S16 *pDelayLine,
+    OMX_INT *pDelayLineIndex
+);
+
+
+
+/**
+ * Function:  omxSP_FIR_Direct_S16_I   (2.2.3.1.1)
+ *
+ * Description:
+ * In-place block FIR filtering for 16-bit data type.  This function
+ * applies the FIR filter defined by the coefficient vector pTapsQ15 to a
+ * vector of input data; the same vector, pSrcDst, supplies the input and
+ * receives the output.  The result is saturated with rounding if the
+ * operation produces a value outside the range of a signed 16-bit integer.
+ * Rounding behavior is defined in:
+ *     section 1.6.7 "Integer Scaling and Rounding Conventions".
+ * The internal accumulator width must be at least 32 bits.  The result
+ * is undefined if any of the partially accumulated values exceeds the
+ * range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pSrcDst - pointer to the vector of input samples to which the
+ *            filter is applied
+ *   sampLen - the number of samples contained in the input and output
+ *            vectors
+ *   pTapsQ15 - pointer to the vector that contains the filter coefficients,
+ *            represented in Q0.15 format (defined in section 1.6.5). Given
+ *            that:
+ *                    -32768 <= pTapsQ15(k) < 32768,
+ *                         0 <= k < tapsLen,
+ *            the range on the actual filter coefficients is -1 <= bK < 1,
+ *            and therefore coefficient normalization may be required during
+ *            the filter design process.
+ *   tapsLen - the number of taps, or, equivalently, the filter order + 1
+ *   pDelayLine - pointer to the (2 * tapsLen)-element filter memory buffer
+ *            (state). The user is responsible for allocation, initialization,
+ *            and de-allocation. The filter memory elements are initialized to
+ *            zero in most applications.
+ *   pDelayLineIndex - pointer to the filter memory index that is maintained
+ *            internally by the function. The user should initialize the value
+ *            of this index to zero.
+ *
+ * Output Arguments:
+ *
+ *   pSrcDst - pointer to the vector of filtered output samples
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -   One or more of the following pointers is NULL:
+ *          -  pSrcDst,
+ *          -  pTapsQ15,
+ *          -  pDelayLine, or
+ *          -  pDelayLineIndex
+ *    -   sampLen < 0
+ *    -   tapsLen < 1
+ *    -   *pDelayLineIndex < 0 or *pDelayLineIndex >= (2 * tapsLen).
+ *
+ */
+OMXResult omxSP_FIR_Direct_S16_I (
+    OMX_S16 *pSrcDst,
+    OMX_INT sampLen,
+    const OMX_S16 *pTapsQ15,
+    OMX_INT tapsLen,
+    OMX_S16 *pDelayLine,
+    OMX_INT *pDelayLineIndex
+);
+
+
+
+/**
+ * Function:  omxSP_FIR_Direct_S16_Sfs   (2.2.3.1.1)
+ *
+ * Description:
+ * Block FIR filtering for 16-bit data type. This function applies
+ * the FIR filter defined by the coefficient vector pTapsQ15 to a
+ * vector of input data.  The output is multiplied by 2 to the negative
+ * power of scaleFactor (i.e., 2^-scaleFactor) before returning to the caller.
+ * Scaling and rounding conventions are defined in section 1.6.7.
+ * The internal accumulator width must be at least 32 bits.
+ * The result is undefined if any of the partially accumulated
+ * values exceeds the range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pSrc    - pointer to the vector of input samples to which the
+ *            filter is applied
+ *   sampLen - the number of samples contained in the input and output
+ *            vectors
+ *   pTapsQ15 - pointer to the vector that contains the filter coefficients,
+ *            represented in Q0.15 format (defined in section 1.6.5). Given
+ *            that:
+ *                    -32768 <= pTapsQ15(k) < 32768,
+ *                         0 <= k < tapsLen,
+ *            the range on the actual filter coefficients is -1 <= bK < 1,
+ *            and therefore coefficient normalization may be required during
+ *            the filter design process.
+ *   tapsLen - the number of taps, or, equivalently, the filter order + 1
+ *   pDelayLine - pointer to the (2 * tapsLen)-element filter memory buffer
+ *            (state). The user is responsible for allocation, initialization,
+ *            and de-allocation. The filter memory elements are initialized to
+ *            zero in most applications.
+ *   pDelayLineIndex - pointer to the filter memory index that is maintained
+ *            internally by the function. The user should initialize the value
+ *            of this index to zero.
+ *   scaleFactor - saturation fixed scale factor; the output is multiplied
+ *            by 2^-scaleFactor. Must be non-negative.
+ *
+ * Output Arguments:
+ *
+ *   pDst  - pointer to the vector of filtered output samples
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -   One or more of the following pointers is NULL:
+ *          -  pSrc,
+ *          -  pDst,
+ *          -  pTapsQ15,
+ *          -  pDelayLine, or
+ *          -  pDelayLineIndex
+ *    -   sampLen < 0
+ *    -   tapsLen < 1
+ *    -   scaleFactor < 0
+ *    -   *pDelayLineIndex < 0 or *pDelayLineIndex >= (2 * tapsLen).
+ *
+ */
+OMXResult omxSP_FIR_Direct_S16_Sfs (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst,
+    OMX_INT sampLen,
+    const OMX_S16 *pTapsQ15,
+    OMX_INT tapsLen,
+    OMX_S16 *pDelayLine,
+    OMX_INT *pDelayLineIndex,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FIR_Direct_S16_ISfs   (2.2.3.1.1)
+ *
+ * Description:
+ * In-place block FIR filtering for 16-bit data type. This function applies
+ * the FIR filter defined by the coefficient vector pTapsQ15 to a
+ * vector of input data; the same vector, pSrcDst, supplies the input and
+ * receives the output.  The output is multiplied by 2 to the negative
+ * power of scaleFactor (i.e., 2^-scaleFactor) before returning to the caller.
+ * Scaling and rounding conventions are defined in section 1.6.7.
+ * The internal accumulator width must be at least 32 bits.
+ * The result is undefined if any of the partially accumulated
+ * values exceeds the range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pSrcDst - pointer to the vector of input samples to which the
+ *            filter is applied
+ *   sampLen - the number of samples contained in the input and output
+ *            vectors
+ *   pTapsQ15 - pointer to the vector that contains the filter coefficients,
+ *            represented in Q0.15 format (defined in section 1.6.5). Given
+ *            that:
+ *                    -32768 <= pTapsQ15(k) < 32768,
+ *                         0 <= k < tapsLen,
+ *            the range on the actual filter coefficients is -1 <= bK < 1,
+ *            and therefore coefficient normalization may be required during
+ *            the filter design process.
+ *   tapsLen - the number of taps, or, equivalently, the filter order + 1
+ *   pDelayLine - pointer to the (2 * tapsLen)-element filter memory buffer
+ *            (state). The user is responsible for allocation, initialization,
+ *            and de-allocation. The filter memory elements are initialized to
+ *            zero in most applications.
+ *   pDelayLineIndex - pointer to the filter memory index that is maintained
+ *            internally by the function. The user should initialize the value
+ *            of this index to zero.
+ *   scaleFactor - saturation fixed scale factor; the output is multiplied
+ *            by 2^-scaleFactor. Must be non-negative.
+ *
+ * Output Arguments:
+ *
+ *   pSrcDst - pointer to the vector of filtered output samples
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -   One or more of the following pointers is NULL:
+ *          -  pSrcDst,
+ *          -  pTapsQ15,
+ *          -  pDelayLine, or
+ *          -  pDelayLineIndex
+ *    -   sampLen < 0
+ *    -   tapsLen < 1
+ *    -   scaleFactor < 0
+ *    -   *pDelayLineIndex < 0 or *pDelayLineIndex >= (2 * tapsLen).
+ *
+ */
+OMXResult omxSP_FIR_Direct_S16_ISfs (
+    OMX_S16 *pSrcDst,
+    OMX_INT sampLen,
+    const OMX_S16 *pTapsQ15,
+    OMX_INT tapsLen,
+    OMX_S16 *pDelayLine,
+    OMX_INT *pDelayLineIndex,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FIROne_Direct_S16   (2.2.3.1.2)
+ *
+ * Description:
+ * Single-sample FIR filtering for 16-bit data type. This function applies
+ * the FIR filter defined by the coefficient vector pTapsQ15 to a single
+ * sample of input data. The result is saturated with rounding if the
+ * operation produces a value outside the range of a signed 16-bit integer.
+ * Rounding behavior is defined in:
+ *       section 1.6.7 "Integer Scaling and Rounding Conventions".
+ * The internal accumulator width must be at least 32 bits.  The result is
+ * undefined if any of the partially accumulated values exceeds the range of a
+ * signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   val      - the single input sample to which the filter is
+ *            applied.
+ *   pTapsQ15 - pointer to the vector that contains the filter coefficients,
+ *            represented in Q0.15 format (as defined in section 1.6.5). Given
+ *            that:
+ *                    -32768 <= pTapsQ15(k) < 32768,
+ *                         0 <= k < tapsLen,
+ *            the range on the actual filter coefficients is -1 <= bK < 1,
+ *            and therefore coefficient normalization may be required during
+ *            the filter design process.
+ *   tapsLen - the number of taps, or, equivalently, the filter order + 1
+ *   pDelayLine - pointer to the (2 * tapsLen)-element filter memory buffer
+ *            (state). The user is responsible for allocation, initialization,
+ *            and de-allocation. The filter memory elements are initialized to
+ *            zero in most applications.
+ *   pDelayLineIndex - pointer to the filter memory index that is maintained
+ *            internally by the function. The user should initialize the value
+ *            of this index to zero.
+ *
+ * Output Arguments:
+ *
+ *   pResult - pointer to the filtered output sample
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -    One or more of the following pointers is NULL:
+ *            -  pResult,
+ *            -  pTapsQ15,
+ *            -  pDelayLine, or
+ *            -  pDelayLineIndex
+ *    -    tapsLen < 1
+ *    -    *pDelayLineIndex < 0 or *pDelayLineIndex >= (2 * tapsLen)
+ *
+ */
+OMXResult omxSP_FIROne_Direct_S16 (
+    OMX_S16 val,
+    OMX_S16 *pResult,
+    const OMX_S16 *pTapsQ15,
+    OMX_INT tapsLen,
+    OMX_S16 *pDelayLine,
+    OMX_INT *pDelayLineIndex
+);
+
+
+
+/**
+ * Function:  omxSP_FIROne_Direct_S16_I   (2.2.3.1.2)
+ *
+ * Description:
+ * In-place single-sample FIR filtering for 16-bit data type. This function
+ * applies the FIR filter defined by the coefficient vector pTapsQ15 to a
+ * single sample of input data; the sample at pValResult is the input and is
+ * replaced by the output. The result is saturated with rounding if the
+ * operation produces a value outside the range of a signed 16-bit integer.
+ * Rounding behavior is defined in:
+ *       section 1.6.7 "Integer Scaling and Rounding Conventions".
+ * The internal accumulator width must be at least 32 bits.  The result is
+ * undefined if any of the partially accumulated values exceeds the range of a
+ * signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pValResult - pointer to the single input sample to which the filter is
+ *            applied.
+ *   pTapsQ15 - pointer to the vector that contains the filter coefficients,
+ *            represented in Q0.15 format (as defined in section 1.6.5). Given
+ *            that:
+ *                    -32768 <= pTapsQ15(k) < 32768,
+ *                         0 <= k < tapsLen,
+ *            the range on the actual filter coefficients is -1 <= bK < 1,
+ *            and therefore coefficient normalization may be required during
+ *            the filter design process.
+ *   tapsLen - the number of taps, or, equivalently, the filter order + 1
+ *   pDelayLine - pointer to the (2 * tapsLen)-element filter memory buffer
+ *            (state). The user is responsible for allocation, initialization,
+ *            and de-allocation. The filter memory elements are initialized to
+ *            zero in most applications.
+ *   pDelayLineIndex - pointer to the filter memory index that is maintained
+ *            internally by the function. The user should initialize the value
+ *            of this index to zero.
+ *
+ * Output Arguments:
+ *
+ *   pValResult - pointer to the filtered output sample
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -    One or more of the following pointers is NULL:
+ *            -  pValResult,
+ *            -  pTapsQ15,
+ *            -  pDelayLine, or
+ *            -  pDelayLineIndex
+ *    -    tapsLen < 1
+ *    -    *pDelayLineIndex < 0 or *pDelayLineIndex >= (2 * tapsLen)
+ *
+ */
+OMXResult omxSP_FIROne_Direct_S16_I (
+    OMX_S16 *pValResult,
+    const OMX_S16 *pTapsQ15,
+    OMX_INT tapsLen,
+    OMX_S16 *pDelayLine,
+    OMX_INT *pDelayLineIndex
+);
+
+
+
+/**
+ * Function:  omxSP_FIROne_Direct_S16_Sfs   (2.2.3.1.2)
+ *
+ * Description:
+ * Single-sample FIR filtering for 16-bit data type. This function applies
+ * the FIR filter defined by the coefficient vector pTapsQ15 to a single
+ * sample of input data. The output is multiplied by 2 to the negative power
+ * of scaleFactor (i.e., 2^-scaleFactor) before returning to the user.
+ * Scaling and rounding conventions are defined in section 1.6.7.
+ * The internal accumulator width must be at least 32 bits.
+ * The result is undefined if any of the partially accumulated values exceeds
+ * the range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   val      - the single input sample to which the filter is
+ *            applied.
+ *   pTapsQ15 - pointer to the vector that contains the filter coefficients,
+ *            represented in Q0.15 format (as defined in section 1.6.5). Given
+ *            that:
+ *                    -32768 <= pTapsQ15(k) < 32768,
+ *                         0 <= k < tapsLen,
+ *            the range on the actual filter coefficients is -1 <= bK < 1,
+ *            and therefore coefficient normalization may be required during
+ *            the filter design process.
+ *   tapsLen - the number of taps, or, equivalently, the filter order + 1
+ *   pDelayLine - pointer to the (2 * tapsLen)-element filter memory buffer
+ *            (state). The user is responsible for allocation, initialization,
+ *            and de-allocation. The filter memory elements are initialized to
+ *            zero in most applications.
+ *   pDelayLineIndex - pointer to the filter memory index that is maintained
+ *            internally by the function. The user should initialize the value
+ *            of this index to zero.
+ *   scaleFactor - saturation fixed scale factor; the output is multiplied
+ *            by 2^-scaleFactor. Must be non-negative.
+ *
+ * Output Arguments:
+ *
+ *   pResult - pointer to the filtered output sample
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -    One or more of the following pointers is NULL:
+ *            -  pResult,
+ *            -  pTapsQ15,
+ *            -  pDelayLine, or
+ *            -  pDelayLineIndex
+ *    -    tapsLen < 1
+ *    -    scaleFactor < 0
+ *    -    *pDelayLineIndex < 0 or *pDelayLineIndex >= (2 * tapsLen)
+ *
+ */
+OMXResult omxSP_FIROne_Direct_S16_Sfs (
+    OMX_S16 val,
+    OMX_S16 *pResult,
+    const OMX_S16 *pTapsQ15,
+    OMX_INT tapsLen,
+    OMX_S16 *pDelayLine,
+    OMX_INT *pDelayLineIndex,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FIROne_Direct_S16_ISfs   (2.2.3.1.2)
+ *
+ * Description:
+ * In-place single-sample FIR filtering for 16-bit data type. This function
+ * applies the FIR filter defined by the coefficient vector pTapsQ15 to a
+ * single sample of input data; the sample at pValResult is the input and is
+ * replaced by the output. The output is multiplied by 2 to the negative power
+ * of scaleFactor (i.e., 2^-scaleFactor) before returning to the user.
+ * Scaling and rounding conventions are defined in section 1.6.7.
+ * The internal accumulator width must be at least 32 bits.
+ * The result is undefined if any of the partially accumulated values exceeds
+ * the range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pValResult - pointer to the single input sample to which the filter is
+ *            applied.
+ *   pTapsQ15 - pointer to the vector that contains the filter coefficients,
+ *            represented in Q0.15 format (as defined in section 1.6.5). Given
+ *            that:
+ *                    -32768 <= pTapsQ15(k) < 32768,
+ *                         0 <= k < tapsLen,
+ *            the range on the actual filter coefficients is -1 <= bK < 1,
+ *            and therefore coefficient normalization may be required during
+ *            the filter design process.
+ *   tapsLen - the number of taps, or, equivalently, the filter order + 1
+ *   pDelayLine - pointer to the (2 * tapsLen)-element filter memory buffer
+ *            (state). The user is responsible for allocation, initialization,
+ *            and de-allocation. The filter memory elements are initialized to
+ *            zero in most applications.
+ *   pDelayLineIndex - pointer to the filter memory index that is maintained
+ *            internally by the function. The user should initialize the value
+ *            of this index to zero.
+ *   scaleFactor - saturation fixed scale factor; the output is multiplied
+ *            by 2^-scaleFactor. Must be non-negative.
+ *
+ * Output Arguments:
+ *
+ *   pValResult - pointer to the filtered output sample
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -    One or more of the following pointers is NULL:
+ *            -  pValResult,
+ *            -  pTapsQ15,
+ *            -  pDelayLine, or
+ *            -  pDelayLineIndex
+ *    -    tapsLen < 1
+ *    -    scaleFactor < 0
+ *    -    *pDelayLineIndex < 0 or *pDelayLineIndex >= (2 * tapsLen)
+ *
+ */
+OMXResult omxSP_FIROne_Direct_S16_ISfs (
+    OMX_S16 *pValResult,
+    const OMX_S16 *pTapsQ15,
+    OMX_INT tapsLen,
+    OMX_S16 *pDelayLine,
+    OMX_INT *pDelayLineIndex,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_IIR_Direct_S16   (2.2.3.2.1)
+ *
+ * Description:
+ * Block IIR filtering for 16-bit data. This function applies the direct form
+ * II IIR filter defined by the coefficient vector pTaps to a vector of input
+ * data.  The internal accumulator width must be at least 32 bits, and the
+ * result is saturated if the operation produces a value outside the range of
+ * a signed 16-bit integer, i.e., the output will saturate to 0x8000 (-32768)
+ * for a negative overflow or 0x7fff (32767) for a positive overflow.  The
+ * result is undefined if any of the partially accumulated values exceeds the
+ * range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pSrc  - pointer to the vector of input samples to which the
+ *            filter is applied
+ *   len - the number of samples contained in both the input and output
+ *            vectors
+ *   pTaps - pointer to the 2L+2-element vector that contains the combined
+ *            numerator and denominator filter coefficients from the system
+ *            transfer function, H(z). Coefficient scaling and coefficient
+ *            vector organization should follow the conventions described
+ *            above. The value of the coefficient scaleFactor exponent must be
+ *            non-negative (sf>=0); a negative pTaps[order+1] is rejected as
+ *            negative scaling (see Return Value).
+ *   order - the maximum of the degrees of the numerator and denominator
+ *            coefficient polynomials from the system transfer function, H(z).
+ *            In the notation of section 2.2.3.2, the parameter
+ *            order=max(K,M)=L gives the maximum delay, in samples, used to
+ *            compute each output sample.
+ *   pDelayLine - pointer to the L-element filter memory buffer (state). The
+ *            user is responsible for allocation, initialization, and
+ *            deallocation. The filter memory elements are initialized to zero
+ *            in most applications.
+ *
+ * Output Arguments:
+ *
+ *   pDst - pointer to the vector of filtered output samples
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -    one or more of the following pointers is NULL:
+ *             -  pSrc,
+ *             -  pDst,
+ *             -  pTaps, or
+ *             -  pDelayLine.
+ *    -    len < 0
+ *    -    pTaps[order+1] < 0 (negative scaling)
+ *    -    order < 1
+ *
+ */
+OMXResult omxSP_IIR_Direct_S16 (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst,
+    OMX_INT len,
+    const OMX_S16 *pTaps,
+    OMX_INT order,
+    OMX_S32 *pDelayLine
+);
+
+
+
+/**
+ * Function:  omxSP_IIR_Direct_S16_I   (2.2.3.2.1)
+ *
+ * Description:
+ * In-place block IIR filtering for 16-bit data. This function applies the
+ * direct form II IIR filter defined by the coefficient vector pTaps to a
+ * vector of input data; the same vector, pSrcDst, supplies the input and
+ * receives the output.  The internal accumulator width must be at least 32
+ * bits, and the result is saturated if the operation produces a value
+ * outside the range of a signed 16-bit integer, i.e., the output will
+ * saturate to 0x8000 (-32768) for a negative overflow or 0x7fff (32767) for
+ * a positive overflow.  The result is undefined if any of the partially
+ * accumulated values exceeds the range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pSrcDst - pointer to the vector of input samples to which the
+ *            filter is applied
+ *   len - the number of samples contained in both the input and output
+ *            vectors
+ *   pTaps - pointer to the 2L+2-element vector that contains the combined
+ *            numerator and denominator filter coefficients from the system
+ *            transfer function, H(z). Coefficient scaling and coefficient
+ *            vector organization should follow the conventions described
+ *            above. The value of the coefficient scaleFactor exponent must be
+ *            non-negative (sf>=0); a negative pTaps[order+1] is rejected as
+ *            negative scaling (see Return Value).
+ *   order - the maximum of the degrees of the numerator and denominator
+ *            coefficient polynomials from the system transfer function, H(z).
+ *            In the notation of section 2.2.3.2, the parameter
+ *            order=max(K,M)=L gives the maximum delay, in samples, used to
+ *            compute each output sample.
+ *   pDelayLine - pointer to the L-element filter memory buffer (state). The
+ *            user is responsible for allocation, initialization, and
+ *            deallocation. The filter memory elements are initialized to zero
+ *            in most applications.
+ *
+ * Output Arguments:
+ *
+ *   pSrcDst - pointer to the vector of filtered output samples
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -    one or more of the following pointers is NULL:
+ *             -  pSrcDst,
+ *             -  pTaps, or
+ *             -  pDelayLine.
+ *    -    len < 0
+ *    -    pTaps[order+1] < 0 (negative scaling)
+ *    -    order < 1
+ *
+ */
+OMXResult omxSP_IIR_Direct_S16_I (
+    OMX_S16 *pSrcDst,
+    OMX_INT len,
+    const OMX_S16 *pTaps,
+    OMX_INT order,
+    OMX_S32 *pDelayLine
+);
+
+
+
+/**
+ * Function:  omxSP_IIROne_Direct_S16   (2.2.3.2.2)
+ *
+ * Description:
+ * Single sample IIR filtering for 16-bit data.  This function applies the
+ * direct form II IIR filter defined by the coefficient vector pTaps to a
+ * single sample of input data. The internal accumulator width must be at
+ * least 32 bits, and the result is saturated if the operation produces a
+ * value outside the range of a signed 16-bit integer, i.e., the output will
+ * saturate to 0x8000 (-32768) for a negative overflow or 0x7fff (32767) for a
+ * positive overflow.  The result is undefined if any of the partially
+ * accumulated values exceeds the range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   val - the single input sample to which the filter is
+ *            applied.
+ *   pTaps - pointer to the 2L+2-element vector that contains the combined
+ *            numerator and denominator filter coefficients from the system
+ *            transfer function, H(z). Coefficient scaling and coefficient
+ *            vector organization should follow the conventions described
+ *            above. The value of the coefficient scaleFactor exponent must be
+ *            non-negative (sf>=0); a negative pTaps[order+1] is rejected as
+ *            negative scaling (see Return Value).
+ *   order - the maximum of the degrees of the numerator and denominator
+ *            coefficient polynomials from the system transfer function, H(z).
+ *            In the notation of section 2.2.3.2, the parameter
+ *            order=max(K,M)=L gives the maximum delay, in samples, used to
+ *            compute each output sample.
+ *   pDelayLine - pointer to the L-element filter memory buffer (state). The
+ *            user is responsible for allocation, initialization, and
+ *            deallocation. The filter memory elements are initialized to zero
+ *            in most applications.
+ *
+ * Output Arguments:
+ *
+ *   pResult - pointer to the filtered output sample
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -    one or more of the following pointers is NULL:
+ *             -  pResult,
+ *             -  pTaps, or
+ *             -  pDelayLine.
+ *    -    order < 1
+ *    -    pTaps[order+1] < 0 (negative scaling)
+ *
+ */
+OMXResult omxSP_IIROne_Direct_S16 (
+    OMX_S16 val,
+    OMX_S16 *pResult,
+    const OMX_S16 *pTaps,
+    OMX_INT order,
+    OMX_S32 *pDelayLine
+);
+
+
+
+/**
+ * Function:  omxSP_IIROne_Direct_S16_I   (2.2.3.2.2)
+ *
+ * Description:
+ * In-place single sample IIR filtering for 16-bit data.  This function
+ * applies the direct form II IIR filter defined by the coefficient vector
+ * pTaps to a single sample of input data; the sample at pValResult is the
+ * input and is replaced by the output. The internal accumulator width must
+ * be at least 32 bits, and the result is saturated if the operation produces
+ * a value outside the range of a signed 16-bit integer, i.e., the output will
+ * saturate to 0x8000 (-32768) for a negative overflow or 0x7fff (32767) for a
+ * positive overflow.  The result is undefined if any of the partially
+ * accumulated values exceeds the range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pValResult - pointer to the single input sample to which the filter is
+ *            applied.
+ *   pTaps - pointer to the 2L+2-element vector that contains the combined
+ *            numerator and denominator filter coefficients from the system
+ *            transfer function, H(z). Coefficient scaling and coefficient
+ *            vector organization should follow the conventions described
+ *            above. The value of the coefficient scaleFactor exponent must be
+ *            non-negative (sf>=0); a negative pTaps[order+1] is rejected as
+ *            negative scaling (see Return Value).
+ *   order - the maximum of the degrees of the numerator and denominator
+ *            coefficient polynomials from the system transfer function, H(z).
+ *            In the notation of section 2.2.3.2, the parameter
+ *            order=max(K,M)=L gives the maximum delay, in samples, used to
+ *            compute each output sample.
+ *   pDelayLine - pointer to the L-element filter memory buffer (state). The
+ *            user is responsible for allocation, initialization, and
+ *            deallocation. The filter memory elements are initialized to zero
+ *            in most applications.
+ *
+ * Output Arguments:
+ *
+ *   pValResult - pointer to the filtered output sample
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -    one or more of the following pointers is NULL:
+ *             -  pValResult,
+ *             -  pTaps, or
+ *             -  pDelayLine.
+ *    -    order < 1
+ *    -    pTaps[order+1] < 0 (negative scaling)
+ *
+ */
+OMXResult omxSP_IIROne_Direct_S16_I (
+    OMX_S16 *pValResult,
+    const OMX_S16 *pTaps,
+    OMX_INT order,
+    OMX_S32 *pDelayLine
+);
+
+
+
+/**
+ * Function:  omxSP_IIR_BiQuadDirect_S16   (2.2.3.3.1)
+ *
+ * Description:
+ * Block biquad IIR filtering for 16-bit data type. This function applies the
+ * direct form II biquad IIR cascade defined by the coefficient vector pTaps
+ * to a vector of input data.  The internal accumulator width must be at least
+ * 32 bits, and the result is saturated if the operation produces a value
+ * outside the range of a signed 16-bit integer, i.e., the output will
+ * saturate to 0x8000 (-32768) for a negative overflow or 0x7fff (32767) for a
+ * positive overflow.  The result is undefined if any of the partially
+ * accumulated values exceeds the range of a signed 32-bit integer.
+ *
+ * Input Arguments:
+ *
+ *   pSrc - pointer to the vector of input samples to which the
+ *            filter is applied
+ *   len - the number of samples contained in both the input and output
+ *            vectors
+ *   pTaps - pointer to the 6P-element vector that contains the combined
+ *            numerator and denominator filter coefficients from the biquad
+ *            cascade. Coefficient scaling and coefficient vector organization
+ *            should follow the conventions described above. The value of the
+ *            coefficient scaleFactor exponent must be non-negative (sfp>=0);
+ *            a negative pTaps[3+n*6], for 0 <= n < numBiquad, is rejected as
+ *            negative scaling (see Return Value).
+ *   numBiquad - the number of biquads contained in the IIR filter cascade:
+ *            (P)
+ *   pDelayLine - pointer to the 2P-element filter memory buffer (state).
+ *            The user is responsible for allocation, initialization, and
+ *            de-allocation. The filter memory elements are initialized to
+ *            zero in most applications.
+ *
+ * Output Arguments:
+ *
+ *   pDst - pointer to the vector of filtered output samples
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ *              following is true:
+ *    -    one or more of the following pointers is NULL:
+ *             -  pSrc,
+ *             -  pDst,
+ *             -  pTaps, or
+ *             -  pDelayLine.
+ *    -    len < 0
+ *    -    numBiquad < 1
+ *    -    pTaps[3+n*6] < 0, for 0 <= n < numBiquad (negative scaling)
+ *
+ */
+OMXResult omxSP_IIR_BiQuadDirect_S16 (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst,
+    OMX_INT len,
+    const OMX_S16 *pTaps,
+    OMX_INT numBiquad,
+    OMX_S32 *pDelayLine
+);
+
+
+
+/**
+ * Function:  omxSP_IIR_BiQuadDirect_S16_I   (2.2.3.3.1)
+ *
+ * Description:
+ * Block biquad IIR filtering for 16-bit data type. This function applies the 
+ * direct form II biquad IIR cascade defined by the coefficient vector pTaps 
+ * to a vector of input data.  The internal accumulator width must be at least 
+ * 32 bits, and the result is saturated if the operation produces a value 
+ * outside the range of a signed 16-bit integer, i.e., the output will 
+ * saturate to 0x8000 (-32768) for a negative overflow or 0x7fff (32767) for a 
+ * positive overflow.  The result is undefined if any of the partially 
+ * accumulated values exceeds the range of a signed 32-bit integer. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the vector of input samples to which the 
+ *            filter is applied 
+ *   len - the number of samples contained in both the input and output 
+ *            vectors 
+ *   pTaps - pointer to the 6P -element vector that contains the combined 
+ *            numerator and denominator filter coefficients from the biquad 
+ *            cascade. Coefficient scaling and coefficient vector organization 
+ *            should follow the conventions described above. The value of the 
+ *            coefficient scaleFactor exponent must be non-negative. (sfp>=0). 
+ *   numBiquad - the number of biquads contained in the IIR filter cascade: 
+ *            (P) 
+ *   pDelayLine - pointer to the 2P -element filter memory buffer (state). 
+ *            The user is responsible for allocation, initialization, and 
+ *            de-allocation. The filter memory elements are initialized to 
+ *            zero in most applications. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - pointer to the vector of filtered output samples 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -    one or more of the following pointers is NULL: 
+ *              pSrcDst, pTaps, or pDelayLine. 
+ *    -    len < 0 
+ *    -    numBiquad < 1 
+ *    -    pTaps[3+n*6] < 0, for 0 <= n < numBiquad (negative scaling) 
+ *
+ */
+OMXResult omxSP_IIR_BiQuadDirect_S16_I (
+    OMX_S16 *pSrcDst,
+    OMX_INT len,
+    const OMX_S16 *pTaps,
+    OMX_INT numBiquad,
+    OMX_S32 *pDelayLine
+);
+
+
+
+/**
+ * Function:  omxSP_IIROne_BiQuadDirect_S16   (2.2.3.3.2)
+ *
+ * Description:
+ * Single-sample biquad IIR filtering for 16-bit data type. This function 
+ * applies the direct form II biquad IIR cascade defined by the coefficient 
+ * vector pTaps to a single sample of input data.  The internal accumulator 
+ * width must be at least 32 bits, and the result is saturated if the 
+ * operation produces a value outside the range of a signed 16-bit integer, 
+ * i.e., the output will saturate to 0x8000 (-32768) for a negative overflow 
+ * or 0x7fff (32767) for a positive overflow.  The result is undefined if any 
+ * of the partially accumulated values exceeds the range of a signed 32-bit 
+ * integer. 
+ *
+ * Input Arguments:
+ *   
+ *   val   - the single input sample to which the filter is 
+ *            applied. 
+ *   pTaps - pointer to the 6P-element vector that contains the combined 
+ *            numerator and denominator filter coefficients from the biquad 
+ *            cascade. Coefficient scaling and coefficient vector organization 
+ *            should follow the conventions described above. The value of the 
+ *            coefficient scalefactor exponent must be non-negative: (sfp>=0). 
+ *   numBiquad - the number of biquads contained in the IIR filter cascade: 
+ *            (P) 
+ *   pDelayLine - pointer to the 2p-element filter memory buffer (state). The 
+ *            user is responsible for allocation, initialization, and 
+ *            deallocation. The filter memory elements are initialized to zero 
+ *            in most applications. 
+ *
+ * Output Arguments:
+ *   
+ *   pResult - pointer to the filtered output sample 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -    one or more of the following pointers is NULL: pResult, 
+ *              pTaps, or pDelayLine. 
+ *    -    numBiquad < 1 
+ *    -    pTaps[3+n*6] < 0, for 0 <= n < numBiquad (negative scaling) 
+ *
+ */
+OMXResult omxSP_IIROne_BiQuadDirect_S16 (
+    OMX_S16 val,
+    OMX_S16 *pResult,
+    const OMX_S16 *pTaps,
+    OMX_INT numBiquad,
+    OMX_S32 *pDelayLine
+);
+
+
+
+/**
+ * Function:  omxSP_IIROne_BiQuadDirect_S16_I   (2.2.3.3.2)
+ *
+ * Description:
+ * Single-sample biquad IIR filtering for 16-bit data type. This function 
+ * applies the direct form II biquad IIR cascade defined by the coefficient 
+ * vector pTaps to a single sample of input data.  The internal accumulator 
+ * width must be at least 32 bits, and the result is saturated if the 
+ * operation produces a value outside the range of a signed 16-bit integer, 
+ * i.e., the output will saturate to 0x8000 (-32768) for a negative overflow 
+ * or 0x7fff (32767) for a positive overflow.  The result is undefined if any 
+ * of the partially accumulated values exceeds the range of a signed 32-bit 
+ * integer. 
+ *
+ * Input Arguments:
+ *   
+ *   pValResult - pointer to the single input sample to which the filter is 
+ *            applied. 
+ *   pTaps - pointer to the 6P-element vector that contains the combined 
+ *            numerator and denominator filter coefficients from the biquad 
+ *            cascade. Coefficient scaling and coefficient vector organization 
+ *            should follow the conventions described above. The value of the 
+ *            coefficient scalefactor exponent must be non-negative: (sfp>=0). 
+ *   numBiquad - the number of biquads contained in the IIR filter cascade: 
+ *            (P) 
+ *   pDelayLine - pointer to the 2p-element filter memory buffer (state). The 
+ *            user is responsible for allocation, initialization, and 
+ *            deallocation. The filter memory elements are initialized to zero 
+ *            in most applications. 
+ *
+ * Output Arguments:
+ *   
+ *   pValResult - pointer to the filtered output sample 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -    one or more of the following pointers is NULL:
+ *              pValResult, pTaps, or pDelayLine. 
+ *    -    numBiquad < 1 
+ *    -    pTaps[3+n*6] < 0, for 0 <= n < numBiquad (negative scaling) 
+ *
+ */
+OMXResult omxSP_IIROne_BiQuadDirect_S16_I (
+    OMX_S16 *pValResult,
+    const OMX_S16 *pTaps,
+    OMX_INT numBiquad,
+    OMX_S32 *pDelayLine
+);
+
+
+
+/**
+ * Function:  omxSP_FilterMedian_S32   (2.2.3.4.1)
+ *
+ * Description:
+ * This function computes the median over the region specified by the median 
+ * mask for every element of the input array. The median outputs are 
+ * stored in the corresponding elements of the output vector. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - pointer to the input vector 
+ *   len - number of elements contained in the input and output vectors (0 < 
+ *            len < 65536) 
+ *   maskSize - median mask size; if an even value is specified, the function 
+ *            subtracts 1 and uses the odd value of the filter mask for median 
+ *            filtering (0 < maskSize < 256) 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the median-filtered output vector 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst. 
+ *    -    len < 0 
+ *    -    maskSize < 1 or maskSize> 255 
+ *    OMX_StsSP_EvenMedianMaskSizeErr - even mask size replaced by odd mask 
+ *              size 
+ *
+ */
+OMXResult omxSP_FilterMedian_S32 (
+    const OMX_S32 *pSrc,
+    OMX_S32 *pDst,
+    OMX_INT len,
+    OMX_INT maskSize
+);
+
+
+
+/**
+ * Function:  omxSP_FilterMedian_S32_I   (2.2.3.4.1)
+ *
+ * Description:
+ * This function computes the median over the region specified by the median 
+ * mask for every element of the input array. The median outputs are 
+ * stored in the corresponding elements of the output vector. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the input vector 
+ *   len - number of elements contained in the input and output vectors (0 < 
+ *            len < 65536) 
+ *   maskSize - median mask size; if an even value is specified, the function 
+ *            subtracts 1 and uses the odd value of the filter mask for median 
+ *            filtering (0 < maskSize < 256) 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - pointer to the median-filtered output vector 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -    pSrcDst is NULL. 
+ *    -    len < 0 
+ *    -    maskSize < 1 or maskSize> 255 
+ *    OMX_StsSP_EvenMedianMaskSizeErr - even mask size replaced by odd mask 
+ *              size 
+ *
+ */
+OMXResult omxSP_FilterMedian_S32_I (
+    OMX_S32 *pSrcDst,
+    OMX_INT len,
+    OMX_INT maskSize
+);
+
+
+
+/**
+ * Function:  omxSP_FFTInit_C_SC16   (2.2.4.1.2)
+ *
+ * Description:
+ * These functions initialize the specification structures required for the 
+ * complex FFT and IFFT functions. Desired block length is specified as an 
+ * input. The function <FFTInit_C_SC16> is used to initialize the 
+ * specification structures for functions <FFTFwd_CToC_SC16_Sfs> and 
+ * <FFTInv_CToC_SC16_Sfs>.
+ *
+ * Memory for the specification structure *pFFTSpec 
+ * must be allocated prior to calling these functions and should be 4-byte 
+ * aligned for omxSP_FFTInit_C_SC16. 
+ *
+ * The space required for *pFFTSpec, in bytes, can be 
+ * determined using <FFTGetBufSize_C_SC16>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the desired block length; 
+ *           valid in the range [0,12] 
+ *
+ * Output Arguments:
+ *   
+ *   pFFTSpec - pointer to initialized specification structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -   pFFTSpec is either NULL or violates the 4-byte alignment 
+ *              restrictions 
+ *    -   order < 0 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTInit_C_SC16 (
+    OMXFFTSpec_C_SC16 *pFFTSpec,
+    OMX_INT order
+);
+
+
+
+/**
+ * Function:  omxSP_FFTInit_C_SC32   (2.2.4.1.2)
+ *
+ * Description:
+ * These functions initialize the specification structures required for the 
+ * complex FFT and IFFT functions. Desired block length is specified as an 
+ * input. The function <FFTInit_C_SC32> is used to initialize 
+ * the specification structures for the functions <FFTFwd_CToC_SC32_Sfs> and 
+ * <FFTInv_CToC_SC32_Sfs>.
+ *
+ * Memory for the specification structure *pFFTSpec must be allocated prior 
+ * to calling these functions and should be 8-byte aligned for 
+ * omxSP_FFTInit_C_SC32. 
+ *
+ * The space required for *pFFTSpec, in bytes, can be 
+ * determined using <FFTGetBufSize_C_SC32>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the desired block length; valid in the range 
+ *            [0,12] 
+ *
+ * Output Arguments:
+ *   
+ *   pFFTSpec - pointer to initialized specification structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -   pFFTSpec is either NULL or violates the 8-byte alignment 
+ *              restrictions 
+ *    -   order < 0 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTInit_C_SC32 (
+    OMXFFTSpec_C_SC32 *pFFTSpec,
+    OMX_INT order
+);
+
+
+
+/**
+ * Function:  omxSP_FFTInit_R_S16S32   (2.2.4.1.4)
+ *
+ * Description:
+ * These functions initialize specification structures required for the real 
+ * FFT and IFFT functions. The function <FFTInit_R_S16S32> is used to 
+ * initialize the specification structures for functions 
+ * <FFTFwd_RToCCS_S16S32_Sfs> and <FFTInv_CCSToR_S32S16_Sfs>.
+ * 
+ * Memory for 
+ * *pFFTFwdSpec must be allocated before calling these functions and should be 
+ * 8-byte aligned. The number of bytes required for *pFFTFwdSpec can be 
+ * determined using <FFTGetBufSize_R_S16S32>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the desired block length; valid in the range 
+ *            [0,12] 
+ *
+ * Output Arguments:
+ *   
+ *   pFFTFwdSpec - pointer to the initialized specification structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -   pFFTFwdSpec is either NULL or violates the 8-byte alignment 
+ *              restrictions 
+ *    -   order < 0 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTInit_R_S16S32 (
+    OMXFFTSpec_R_S16S32*pFFTFwdSpec,
+    OMX_INT order
+);
+
+
+
+/**
+ * Function:  omxSP_FFTInit_R_S32   (2.2.4.1.4)
+ *
+ * Description:
+ * These functions initialize specification structures required for the real 
+ * FFT and IFFT functions. The function <FFTInit_R_S32> is used to initialize 
+ * the specification structures for functions <FFTFwd_RToCCS_S32_Sfs> 
+ * and <FFTInv_CCSToR_S32_Sfs>. 
+ *
+ * Memory for *pFFTFwdSpec must be allocated before calling these functions
+ * and should be 8-byte aligned. 
+ *
+ * The number of bytes required for *pFFTFwdSpec can be 
+ * determined using <FFTGetBufSize_R_S32>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the desired block length; valid in the range 
+ *            [0,12] 
+ *
+ * Output Arguments:
+ *   
+ *   pFFTFwdSpec - pointer to the initialized specification structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -   pFFTFwdSpec is either NULL or violates the 8-byte alignment 
+ *              restrictions 
+ *    -   order < 0 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTInit_R_S32 (
+    OMXFFTSpec_R_S32*pFFTFwdSpec,
+    OMX_INT order
+);
+
+
+
+/**
+ * Function:  omxSP_FFTGetBufSize_C_SC16   (2.2.4.1.6)
+ *
+ * Description:
+ * These functions compute the size of the specification structure 
+ * required for the length 2^order complex FFT and IFFT functions. The function 
+ * <FFTGetBufSize_C_SC16> is used in conjunction with the 16-bit functions 
+ * <FFTFwd_CToC_SC16_Sfs> and <FFTInv_CToC_SC16_Sfs>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the desired block length; valid in the range 
+ *            [0,12] 
+ *
+ * Output Arguments:
+ *   
+ *   pSize - pointer to the number of bytes required for the specification 
+ *            structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -    pSize is NULL 
+ *    -    order < 0 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTGetBufSize_C_SC16 (
+    OMX_INT order,
+    OMX_INT *pSize
+);
+
+
+
+/**
+ * Function:  omxSP_FFTGetBufSize_C_SC32   (2.2.4.1.6)
+ *
+ * Description:
+ * These functions compute the size of the specification structure 
+ * required for the length 2^order complex FFT and IFFT functions. The function 
+ * <FFTGetBufSize_C_SC32> is used in conjunction with the 32-bit functions 
+ * <FFTFwd_CToC_SC32_Sfs> and <FFTInv_CToC_SC32_Sfs>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the desired block length; valid in the range 
+ *            [0,12] 
+ *
+ * Output Arguments:
+ *   
+ *   pSize - pointer to the number of bytes required for the specification 
+ *            structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -    pSize is NULL 
+ *    -    order < 0 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTGetBufSize_C_SC32 (
+    OMX_INT order,
+    OMX_INT *pSize
+);
+
+
+
+/**
+ * Function:  omxSP_FFTGetBufSize_R_S16S32   (2.2.4.1.8)
+ *
+ * Description:
+ * These functions compute the size of the specification structure 
+ * required for the length 2^order real FFT and IFFT functions. The function 
+ * <FFTGetBufSize_R_S16S32> is used in conjunction with the 16-bit functions 
+ * <FFTFwd_RToCCS_S16S32_Sfs> and <FFTInv_CCSToR_S32S16_Sfs>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the length; valid in the range [0,12] 
+ *
+ * Output Arguments:
+ *   
+ *   pSize - pointer to the number of bytes required for the specification 
+ *            structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -    pSize is NULL 
+ *    -    order < 0 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTGetBufSize_R_S16S32 (
+    OMX_INT order,
+    OMX_INT *pSize
+);
+
+
+
+/**
+ * Function:  omxSP_FFTGetBufSize_R_S32   (2.2.4.1.8)
+ *
+ * Description:
+ * These functions compute the size of the specification structure 
+ * required for the length 2^order real FFT and IFFT functions.  The function 
+ * <FFTGetBufSize_R_S32> is used in conjunction with the 32-bit functions 
+ * <FFTFwd_RToCCS_S32_Sfs> and <FFTInv_CCSToR_S32_Sfs>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the length; valid in the range [0,12] 
+ *
+ * Output Arguments:
+ *   
+ *   pSize - pointer to the number of bytes required for the specification 
+ *            structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -    pSize is NULL 
+ *    -    order < 0 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTGetBufSize_R_S32 (
+    OMX_INT order,
+    OMX_INT *pSize
+);
+
+
+
+/**
+ * Function:  omxSP_FFTFwd_CToC_SC16_Sfs   (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order, 
+ * where 0 <= order <= 12. 
+ * Transform length is determined by the specification structure, which 
+ * must be initialized prior to calling the FFT function using the appropriate 
+ * helper, i.e., <FFTInit_C_SC32> or <FFTInit_C_SC16>. The relationship 
+ * between the input and output sequences can be expressed in terms of the 
+ * DFT, i.e., 
+ *
+ *      X[k] = 2^(-scaleFactor) . SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ *      k = 0,1,2,..., N-1
+ *      N = 2^order
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the input signal, a complex-valued vector of length 2^order; 
+ *            must be aligned on a 32 byte boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *   scaleFactor - output scale factor; the range is [0,16] 
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output vector, of length 2^order; 
+ *          must be aligned on a 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions 
+ *              is true: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or 
+ *              pFFTSpec. 
+ *    -    pSrc or pDst is not 32-byte aligned 
+ *    -    scaleFactor<0 or scaleFactor>16
+ *
+ */
+OMXResult omxSP_FFTFwd_CToC_SC16_Sfs (
+    const OMX_SC16 *pSrc,
+    OMX_SC16 *pDst,
+    const OMXFFTSpec_C_SC16 *pFFTSpec,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FFTFwd_CToC_SC32_Sfs   (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order, 
+ * where 0 <= order <= 12. 
+ * Transform length is determined by the specification structure, which 
+ * must be initialized prior to calling the FFT function using the appropriate 
+ * helper, i.e., <FFTInit_C_SC32> or <FFTInit_C_SC16>. The relationship 
+ * between the input and output sequences can be expressed in terms of the 
+ * DFT, i.e., 
+ *
+ *      X[k] = 2^(-scaleFactor) . SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ *      k = 0,1,2,..., N-1
+ *      N = 2^order
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the input signal, a complex-valued vector of length 2^order; 
+ *            must be aligned on a 32 byte boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *   scaleFactor - output scale factor; the range is [0,32] 
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output vector, of length 2^order; must be 
+ *            aligned on a 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions 
+ *              is true: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or 
+ *              pFFTSpec. 
+ *    -    pSrc or pDst is not 32-byte aligned 
+ *    -    scaleFactor<0 or scaleFactor >32 
+ *
+ */
+OMXResult omxSP_FFTFwd_CToC_SC32_Sfs (
+    const OMX_SC32 *pSrc,
+    OMX_SC32 *pDst,
+    const OMXFFTSpec_C_SC32 *pFFTSpec,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FFTInv_CToC_SC16_Sfs   (2.2.4.2.4)
+ *
+ * Description:
+ * These functions compute an inverse FFT for a complex signal of  length
+ * of 2^order, where 0 <= order <= 12. Transform length is determined by the 
+ * specification structure, which must be initialized prior to calling the FFT 
+ * function using the appropriate helper, i.e., <FFTInit_C_SC32> or 
+ * <FFTInit_C_SC16>. The relationship between the input and output sequences 
+ * can be expressed in terms of the IDFT, i.e.: 
+ *
+ *     x[n] = (2^(-scalefactor)/N)  . SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+ *     n=0,1,2,...N-1
+ *     N=2^order.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input signal, of length 2^order ; 
+ *          must be aligned on a 32-byte boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *   scaleFactor - scale factor of the output. Valid range is [0,16]. 
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output signal, 
+ *          of length 2^order; 
+ *          must be aligned on a 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions 
+ *              is true: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or 
+ *              pFFTSpec. 
+ *    -   pSrc or pDst is not 32-byte aligned 
+ *    -   scaleFactor<0 or scaleFactor>16 
+ *
+ */
+OMXResult omxSP_FFTInv_CToC_SC16_Sfs (
+    const OMX_SC16 *pSrc,
+    OMX_SC16 *pDst,
+    const OMXFFTSpec_C_SC16 *pFFTSpec,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FFTInv_CToC_SC32_Sfs   (2.2.4.2.4)
+ *
+ * Description:
+ * These functions compute an inverse FFT for a complex signal of length
+ * of 2^order, where 0 <= order <= 12. Transform length is determined by the 
+ * specification structure, which must be initialized prior to calling the FFT 
+ * function using the appropriate helper, i.e., <FFTInit_C_SC32> or 
+ * <FFTInit_C_SC16>. The relationship between the input and output sequences 
+ * can be expressed in terms of the IDFT, i.e.: 
+ *
+ *     x[n] = (2^(-scalefactor)/N)  . SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+ *     n=0,1,2,...N-1
+ *     N=2^order.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input signal, of length 2^order ; 
+ *          must be aligned on a 32-byte boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *   scaleFactor - scale factor of the output. Valid range is [0,32]. 
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output signal, 
+ *          of length 2^order; 
+ *          must be aligned on a 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions 
+ *              is true: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or 
+ *              pFFTSpec. 
+ *    -   pSrc or pDst is not 32-byte aligned 
+ *    -   scaleFactor<0 or scaleFactor>32
+ *
+ */
+OMXResult omxSP_FFTInv_CToC_SC32_Sfs (
+    const OMX_SC32 *pSrc,
+    OMX_SC32 *pDst,
+    const OMXFFTSpec_C_SC32 *pFFTSpec,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FFTFwd_RToCCS_S16S32_Sfs   (2.2.4.4.2)
+ *
+ * Description:
+ * These functions compute an FFT for a real-valued signal of length of 2^order,
+ * where 0 <= order <= 12. Transform length is determined by the 
+ * specification structure, which must be initialized prior to calling the FFT 
+ * function using the appropriate helper, i.e., <FFTInit_R_S16S32>. 
+ * The relationship between the input and output sequences 
+ * can be expressed in terms of the DFT, i.e.:
+ *
+ *     X[k] = 2^(-scaleFactor) . SUM[n=0,...,N-1] x[n].e^(-jnk.2.pi/N)
+ *     k=0,1,2,...N-1
+ *     N=2^order.
+ *
+ * The conjugate-symmetric output sequence is represented using a CCS vector, 
+ * which is of length N+2, and is organized as follows: 
+ *
+ *   Index:      0  1  2  3  4  5   . . .   N-2       N-1       N       N+1 
+ *   Component:  R0 0  R1 I1 R2 I2  . . .   R[N/2-1]  I[N/2-1]  R[N/2]  0 
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components 
+ * for FFT bin 'n'. Bins  are numbered from 0 to N/2, where N is the FFT length. 
+ * Bin index 0 corresponds to the DC component, and bin index N/2 corresponds to the 
+ * foldover frequency. 
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the real-valued input sequence, of length 2^order; 
+ *          must be aligned on a 32-byte boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *   scaleFactor - output scale factor; valid range is [0, 32] 
+ *
+ * Output Arguments:
+ *   pDst - pointer to output sequence, represented using CCS format, of 
+ *            length (2^order)+2; must be aligned on a 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments, if one or more of the following is true: 
+ *    -    one of the pointers pSrc, pDst, or pFFTSpec is NULL 
+ *    -    pSrc or pDst is not aligned on a 32-byte boundary 
+ *    -    scaleFactor<0 or scaleFactor >32 
+ *
+ */
+OMXResult omxSP_FFTFwd_RToCCS_S16S32_Sfs (
+    const OMX_S16 *pSrc,
+    OMX_S32 *pDst,
+    const OMXFFTSpec_R_S16S32 *pFFTSpec,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FFTFwd_RToCCS_S32_Sfs   (2.2.4.4.2)
+ *
+ * Description:
+ * These functions compute an FFT for a real-valued signal of length of 2^order,
+ * where 0 <= order <= 12. Transform length is determined by the 
+ * specification structure, which must be initialized prior to calling the FFT 
+ * function using the appropriate helper, i.e., <FFTInit_R_S32>. 
+ * The relationship between the input and output sequences 
+ * can be expressed in terms of the DFT, i.e.:
+ *
+ *     X[k] = 2^(-scaleFactor) . SUM[n=0,...,N-1] x[n].e^(-jnk.2.pi/N)
+ *     k=0,1,2,...N-1
+ *     N=2^order.
+ *
+ * The conjugate-symmetric output sequence is represented using a CCS vector, 
+ * which is of length N+2, and is organized as follows: 
+ *
+ *   Index:      0  1  2  3  4  5   . . .   N-2       N-1       N       N+1 
+ *   Component:  R0 0  R1 I1 R2 I2  . . .   R[N/2-1]  I[N/2-1]  R[N/2]  0 
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components 
+ * for FFT bin 'n'. Bins  are numbered from 0 to N/2, where N is the FFT length. 
+ * Bin index 0 corresponds to the DC component, and bin index N/2 corresponds to the 
+ * foldover frequency. 
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the real-valued input sequence, of length 2^order; 
+ *          must be aligned on a 32-byte boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *   scaleFactor - output scale factor; valid range is [0, 32] 
+ *
+ * Output Arguments:
+ *   pDst - pointer to output sequence, represented using CCS format, of 
+ *            length (2^order)+2; must be aligned on a 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments, if one or more of the following is true: 
+ *    -    one of the pointers pSrc, pDst, or pFFTSpec is NULL 
+ *    -    pSrc or pDst is not aligned on a 32-byte boundary 
+ *    -    scaleFactor<0 or scaleFactor >32 
+ *
+ */
+OMXResult omxSP_FFTFwd_RToCCS_S32_Sfs (
+    const OMX_S32 *pSrc,
+    OMX_S32 *pDst,
+    const OMXFFTSpec_R_S32 *pFFTSpec,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FFTInv_CCSToR_S32S16_Sfs   (2.2.4.4.4)
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input 
+ * sequence.  Transform length is determined by the specification structure, 
+ * which must be initialized prior to calling the FFT function using 
+ * <FFTInit_R_S16S32>. For a transform of length M, the input sequence is 
+ * represented using a packed CCS vector of length M+2, and is organized 
+ * as follows: 
+ *
+ *   Index:     0    1  2    3    4    5    . . .  M-2       M-1      M      M+1 
+ *   Component  R[0] 0  R[1] I[1] R[2] I[2] . . .  R[M/2-1]  I[M/2-1] R[M/2] 0 
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components for FFT bin n. 
+ * Bins are numbered from 0 to M/2, where M is the FFT length.  Bin index 0 
+ * corresponds to the DC component, and bin index M/2 corresponds to the 
+ * foldover frequency. 
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input sequence represented using 
+ *            CCS format, of length (2^order) + 2; must be aligned on a 32-byte 
+ *            boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *   scaleFactor - output scalefactor; range is [0,16]
+ *
+ * Output Arguments:
+ *   pDst - pointer to the real-valued output sequence, of length 2^order ; must be 
+ *            aligned on a 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments if one or more of the following is true: 
+ *    -    pSrc, pDst, or pFFTSpec is NULL 
+ *    -    pSrc or pDst is not aligned on a 32-byte boundary 
+ *    -    scaleFactor<0 or scaleFactor >16
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_S32S16_Sfs (
+    const OMX_S32 *pSrc,
+    OMX_S16 *pDst,
+    const OMXFFTSpec_R_S16S32 *pFFTSpec,
+    OMX_INT scaleFactor
+);
+
+
+
+/**
+ * Function:  omxSP_FFTInv_CCSToR_S32_Sfs   (2.2.4.4.4)
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input 
+ * sequence.  Transform length is determined by the specification structure, 
+ * which must be initialized prior to calling the FFT function using 
+ * <FFTInit_R_S32>. For a transform of length M, the input sequence is 
+ * represented using a packed CCS vector of length M+2, and is organized 
+ * as follows: 
+ *
+ *   Index:     0    1  2    3    4    5    . . .  M-2       M-1      M      M+1 
+ *   Component  R[0] 0  R[1] I[1] R[2] I[2] . . .  R[M/2-1]  I[M/2-1] R[M/2] 0 
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components for FFT bin n. 
+ * Bins are numbered from 0 to M/2, where M is the FFT length.  Bin index 0 
+ * corresponds to the DC component, and bin index M/2 corresponds to the 
+ * foldover frequency. 
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input sequence represented using 
+ *            CCS format, of length (2^order) + 2; must be aligned on a 32-byte 
+ *            boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *   scaleFactor - output scalefactor; range is [0,32] 
+ *
+ * Output Arguments:
+ *   pDst - pointer to the real-valued output sequence, of length 2^order ; must be 
+ *            aligned on a 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments if one or more of the following is true: 
+ *    -    pSrc, pDst, or pFFTSpec is NULL 
+ *    -    pSrc or pDst is not aligned on a 32-byte boundary 
+ *    -    scaleFactor<0 or scaleFactor >32
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_S32_Sfs (
+    const OMX_S32 *pSrc,
+    OMX_S32 *pDst,
+    const OMXFFTSpec_R_S32 *pFFTSpec,
+    OMX_INT scaleFactor
+);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /** end of #define _OMXSP_H_ */
+
+/** EOF */
+
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
@@ -0,0 +1,294 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of
+@//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
+@//  instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+      @// Guarding implementation by the processor name
+
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+
+@// Output registers
+#define result          r0
+
+@//Local Scratch Registers
+
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total num of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+
+#define pOut1           r2
+#define size            r7
+#define step            r8
+#define step1           r9
+#define twStep          r10
+#define pTwiddleTmp     r11
+#define argTwiddle1     r12
+#define zero            r14
+
+@// Neon registers
+
+#define dX0     D0.F32
+#define dShift  D1.F32
+#define dX1     D1.F32
+#define dY0     D2.F32
+#define dY1     D3.F32
+#define dX0r    D0.F32
+#define dX0i    D1.F32
+#define dX1r    D2.F32
+#define dX1i    D3.F32
+#define dW0r    D4.F32
+#define dW0i    D5.F32
+#define dW1r    D6.F32
+#define dW1i    D7.F32
+#define dT0     D8.F32
+#define dT1     D9.F32
+#define dT2     D10.F32
+#define dT3     D11.F32
+#define qT0     D12.F32
+#define qT1     D14.F32
+#define qT2     D16.F32
+#define qT3     D18.F32
+#define dY0r    D4.F32
+#define dY0i    D5.F32
+#define dY1r    D6.F32
+#define dY1i    D7.F32
+
+#define dY2     D4.F32
+#define dY3     D5.F32
+#define dW0     D6.F32
+#define dW1     D7.F32
+#define dW0Tmp  D10.F32
+#define dW1Neg  D11.F32
+
+#define half    D13.F32
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// scaled and inverse are not referenced in this float version; name
+        @// is appended to the labels. First read the FFT size N from the spec.
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+        @// Lane 0 of half supplies the 1/2 factor for the VMULs below
+        VMOV    half, 0.5
+
+
+        MOV     size,N,ASR #1                 @// preserve the contents of N
+        MOV     step,N,LSL #2                 @// step = N/2 * 8 bytes
+
+
+        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        @// Note: W^(k) is stored as negated value and also need to
+        @// conjugate the values from the table
+
+        @// Z(0) : no need of twiddle multiply
+        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+
+        VLD1    dX0,[pSrc],step
+        ADD     pOut1,pOut,step               @// pOut1 = pOut+ N/2*8 bytes
+
+        VLD1    dX1,[pSrc]!
+        @// twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,size,LSL #1
+
+        MOV     step1,size,LSL #2             @// step1 = N/4 * 8 = N/2*4 bytes
+        SUB     step1,step1,#8                @// (N/4-1)*8 bytes
+
+        VADD    dY0,dX0,dX1                   @// [b+d | a+c]
+        VSUB    dY1,dX0,dX1                   @// [b-d | a-c]
+        VMUL    dY0, dY0, half[0]
+        VMUL    dY1, dY1, half[0]
+
+        @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+        VZIP    dY0,dY1
+
+        VSUB   dX0,dY0,dY1
+        SUBS   size,size,#2                   @// sets the flags for BLT/BEQ below
+        VADD   dX1,dY0,dY1
+
+        SUB     pSrc,pSrc,step
+
+        VST1    dX0[0],[pOut1]!
+        ADD     pTwiddleTmp,pTwiddle,#8       @// W^2
+        VST1    dX1[1],[pOut1]!
+        ADD     argTwiddle1,pTwiddle,twStep   @// W^1
+
+        @// Nothing since the SUBS above writes the flags
+        BLT     decrementScale\name           @// N/2 < 2: nothing left to do
+        BEQ     lastElement\name              @// N/2 == 2: only the last element
+
+
+        @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        @// Note: W^k is stored as negative values in the table and also
+        @// need to conjugate the values from the table.
+        @//
+        @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+        @// since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
+
+
+        SUB     step,step,#24                 @// step = (N/2-3)*8 bytes
+evenOddButterflyLoop\name :
+
+
+        VLD1    dW0r,[argTwiddle1],step1
+        VLD1    dW1r,[argTwiddle1]!
+
+        VLD2    {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle1,argTwiddle1,step1
+        VLD2    {dX1r,dX1i},[pSrc]!
+
+        SUB     step1,step1,#8                @// (N/4-2)*8 bytes
+        VLD1    dW0i,[pTwiddleTmp],step1
+        VLD1    dW1i,[pTwiddleTmp]!
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        VREV64  dX1r,dX1r
+        VREV64  dX1i,dX1i
+        SUBS    size,size,#4                  @// flags consumed by the BGT at loop end
+
+
+        VSUB    dT2,dX0r,dX1r                 @// a-c
+        VADD    dT3,dX0i,dX1i                 @// b+d
+        VADD    dT0,dX0r,dX1r                 @// a+c
+        VSUB    dT1,dX0i,dX1i                 @// b-d
+        SUB     step1,step1,#8
+
+        VMUL    dT2, dT2, half[0]
+        VMUL    dT3, dT3, half[0]
+
+        VMUL    dT0, dT0, half[0]
+        VMUL    dT1, dT1, half[0]
+
+        VZIP    dW1r,dW1i
+        VZIP    dW0r,dW0i
+
+
+        VMUL   dX1r,dW1r,dT2
+        VMUL   dX1i,dW1r,dT3
+        VMUL   dX0r,dW0r,dT2
+        VMUL   dX0i,dW0r,dT3
+
+        VMLS   dX1r,dW1i,dT3
+        VMLA   dX1i,dW1i,dT2
+
+        VMLA   dX0r,dW0i,dT3
+        VMLS   dX0i,dW0i,dT2
+
+
+        VADD    dY1r,dT0,dX1i                 @// F(N/2 -1)
+        VSUB    dY1i,dX1r,dT1
+
+        VREV64  dY1r,dY1r
+        VREV64  dY1i,dY1i
+
+
+        VADD    dY0r,dT0,dX0i                 @// F(1)
+        VSUB    dY0i,dT1,dX0r
+
+
+        VST2    {dY0r,dY0i},[pOut1],step
+        VST2    {dY1r,dY1i},[pOut1]!
+        SUB     pOut1,pOut1,step
+        SUB     step,step,#32                 @// span shrinks by 4 points (32 bytes)
+
+
+        BGT     evenOddButterflyLoop\name     @// loop while size > 0 (SUBS above)
+
+
+        @// set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     pOut1,pOut1,#8
+
+        @// Last element can be expanded as follows
+        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
+        @// -ve)
+        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] - j (c-jd) [0+j2b]
+        @// (a+bc, -bd)
+        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name :
+        VLD1    dX0r,[pSrc]
+
+        VST1    dX0r[0],[pOut1]!
+        VNEG    dX0r,dX0r
+        VST1    dX0r[1],[pOut1]
+
+
+
+decrementScale\name :
+        @// Exit label only: this float version has no scale to adjust.
+        .endm
+
+        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe,r4
+            @// Body is the FFTSTAGE expansion; Inv is the label suffix.
+            FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+
+        .end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S
@@ -0,0 +1,321 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7485
+@// Last Modified Date:       Fri, 21 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT  
+@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+@// It implements both "scaled" (by 1/2) and "unscaled" versions of the above formula
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+        
+                
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+      @// Guarding implementation by the processor name
+    
+    
+    
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+
+@// Output registers
+#define result          r0
+
+@//Local Scratch Registers (comments go on their own line: cpp drops the // text but keeps a trailing @ in the macro body)
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total num of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4    
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+
+#define pOut1           r2
+#define size            r7
+#define step            r8            
+#define step1           r9
+#define twStep          r10
+#define pTwiddleTmp     r11
+#define argTwiddle1     r12
+#define zero            r14
+
+@// Neon registers
+
+#define dX0     D0.S32
+#define dShift  D1.S32
+#define dX1     D1.S32
+#define dY0     D2.S32
+#define dY1     D3.S32
+#define dX0r    D0.S32            
+#define dX0i    D1.S32
+#define dX1r    D2.S32
+#define dX1i    D3.S32
+#define dW0r    D4.S32
+#define dW0i    D5.S32
+#define dW1r    D6.S32
+#define dW1i    D7.S32
+#define dT0     D8.S32
+#define dT1     D9.S32
+#define dT2     D10.S32
+#define dT3     D11.S32
+#define qT0     Q6.S64
+#define qT1     Q7.S64
+#define qT2     Q8.S64
+#define qT3     Q9.S64
+#define dY0r    D4.S32
+#define dY0i    D5.S32
+#define dY1r    D6.S32
+#define dY1i    D7.S32
+
+#define dY2     D4.S32
+#define dY3     D5.S32
+#define dW0     D6.S32
+#define dW1     D7.S32
+#define dW0Tmp  D10.S32
+#define dW1Neg  D11.S32
+
+
+@ Structure offsets for the FFTSpec             
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// scaled = TRUE selects the extra-halving code paths; inverse is not referenced.
+        @// Read the FFT size N from the spec structure
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+        
+        
+        
+        MOV     size,N,ASR #1                    @// preserve the contents of N
+        MOV     step,N,LSL #2                    @// step = N/2 * 8 bytes
+        
+        
+        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        @// Note: W^(k) is stored as negated value and also need to conjugate the values from the table
+        
+        @// Z(0) : no need of twiddle multiply
+        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+        
+        VLD1    dX0,[pSrc],step
+        ADD     pOut1,pOut,step                  @// pOut1 = pOut+ N/2*8 bytes 
+                
+        VLD1    dX1,[pSrc]!
+        SUB     twStep,step,size,LSL #1          @// twStep = 3N/8 * 8 bytes pointing to W^1
+        
+        MOV     step1,size,LSL #2                @// step1 = N/4 * 8 = N/2*4 bytes
+        SUB     step1,step1,#8                   @// (N/4-1)*8 bytes
+        
+        VHADD    dY0,dX0,dX1                     @// [b+d | a+c]
+        VHSUB    dY1,dX0,dX1                     @// [b-d | a-c] 
+        VZIP    dY0,dY1                          @// dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
+        @// The scaled variant halves once more (VHSUB/VHADD instead of VSUB/VADD)
+        .ifeqs  "\scaled", "TRUE"
+            VHSUB   dX0,dY0,dY1
+            SUBS    size,size,#2
+            VHADD   dX1,dY0,dY1
+        .else
+            VSUB   dX0,dY0,dY1
+            SUBS    size,size,#2
+            VADD   dX1,dY0,dY1
+        .endif
+                    
+        SUB     pSrc,pSrc,step
+        
+        VST1    dX0[0],[pOut1]!
+        ADD     pTwiddleTmp,pTwiddle,#8                @// W^2
+        VST1    dX1[1],[pOut1]!
+        ADD     argTwiddle1,pTwiddle,twStep            @// W^1 
+        
+        @// Flags below still come from the SUBS size,size,#2 above
+        BLT     decrementScale\name                    @// N/2 < 2: nothing left to do
+        BEQ     lastElement\name                       @// N/2 == 2: only the last element
+        
+                        
+        @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        @// Note: W^k is stored as negative values in the table and also need to conjugate the values from the table
+        @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1) since both of them
+        @// require F(1),F(2) and F(N/2-2),F(N/2-1)
+        
+        
+        SUB     step,step,#24                          @// step = (N/2-3)*8 bytes
+evenOddButterflyLoop\name :     
+        
+        
+        VLD1    dW0r,[argTwiddle1],step1
+        VLD1    dW1r,[argTwiddle1]!
+        
+        VLD2    {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle1,argTwiddle1,step1
+        VLD2    {dX1r,dX1i},[pSrc]!
+        
+        SUB     step1,step1,#8                          @// (N/4-2)*8 bytes
+        VLD1    dW0i,[pTwiddleTmp],step1
+        VLD1    dW1i,[pTwiddleTmp]!
+        SUB     pSrc,pSrc,step
+        
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        VREV64  dX1r,dX1r
+        VREV64  dX1i,dX1i
+        SUBS    size,size,#4                            @// flags consumed by the BGT at loop end
+        
+                        
+        VHSUB    dT2,dX0r,dX1r                            @// a-c
+        VHADD    dT3,dX0i,dX1i                            @// b+d
+        SUB     step1,step1,#8
+        VHADD    dT0,dX0r,dX1r                           @// a+c
+        VHSUB    dT1,dX0i,dX1i                            @// b-d
+        
+        VZIP    dW1r,dW1i
+        VZIP    dW0r,dW0i
+        
+        @// 32x32 -> 64-bit products; narrowed back with a rounding shift by 31 below
+        VMULL   qT0,dW1r,dT2
+        VMLSL   qT0,dW1i,dT3
+        VMULL   qT1,dW1r,dT3
+        VMLAL   qT1,dW1i,dT2
+                    
+        VMULL   qT2,dW0r,dT2
+        VMLAL   qT2,dW0i,dT3
+        VMULL   qT3,dW0r,dT3
+        VMLSL   qT3,dW0i,dT2
+        
+        
+        VRSHRN  dX1r,qT0,#31
+        VRSHRN  dX1i,qT1,#31
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    dY1r,dT0,dX1i                           @// F(N/2 -1)
+            VHSUB    dY1i,dX1r,dT1
+        .else
+            VADD    dY1r,dT0,dX1i                           @// F(N/2 -1)
+            VSUB    dY1i,dX1r,dT1
+
+        .endif
+        
+        
+        VREV64  dY1r,dY1r
+        VREV64  dY1i,dY1i
+        
+                            
+        VRSHRN  dX0r,qT2,#31
+        VRSHRN  dX0i,qT3,#31
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    dY0r,dT0,dX0i                           @// F(1)
+            VHSUB    dY0i,dT1,dX0r
+        .else
+            VADD    dY0r,dT0,dX0i                           @// F(1)
+            VSUB    dY0i,dT1,dX0r
+        .endif
+        
+        
+        VST2    {dY0r,dY0i},[pOut1],step
+        VST2    {dY1r,dY1i},[pOut1]!
+        SUB     pOut1,pOut1,step
+        SUB     step,step,#32                            @// span shrinks by 4 points (32 bytes)
+        
+        
+        BGT     evenOddButterflyLoop\name                @// loop while size > 0 (SUBS above)
+        
+        
+        SUB     pSrc,pSrc,#8                @// set both the ptrs to the last element
+        SUB     pOut1,pOut1,#8
+        
+        @// Last element can be expanded as follows
+        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as -ve)
+        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] - j (c-jd) [0+j2b]
+        @// (a+bc, -bd)
+        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+        
+lastElement\name :      
+        VLD1    dX0r,[pSrc]
+        
+        .ifeqs  "\scaled", "TRUE"
+            VSHR    dX0r,dX0r,#1
+        .endif
+        
+        VST1    dX0r[0],[pOut1]!
+        VNEG    dX0r,dX0r
+        VST1    dX0r[1],[pOut1]
+        
+        
+
+decrementScale\name :          
+        @// Scaled variant only: scale = scale - 1 (presumably accounts for the extra halving; confirm with the caller)
+        .ifeqs  "\scaled", "TRUE"
+            SUB scale,scale,#1
+        .endif
+        
+        .endm
+        
+        M_START armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe,r4
+            @// Unscaled variant: scaled = FALSE, labels suffixed Inv
+            FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+        
+        M_START armSP_FFTInv_CCSToR_S32_Sfs_preTwiddleRadix2_unsafe,r4
+            @// Scaled variant: scaled = TRUE, labels suffixed InvSfs
+            FFTSTAGE "TRUE","TRUE",InvSfs
+        M_END
+
+        
+        .end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
@@ -0,0 +1,134 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT
+@// stage for a N point complex signal.
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define pPingPongBuf    r5
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define pointStep       r3
+#define outPointStep    r3
+#define grpSize         r4
+#define setCount        r4
+#define step            r8
+#define dstStep         r8
+
+@// Neon Registers
+
+#define dX0     D0.F32
+#define dX1     D1.F32
+#define dY0     D2.F32
+#define dY1     D3.F32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// scaled and inverse are not referenced; name is the label suffix.
+        @// (no stack arguments are used by this stage)
+
+
+        @// update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+
+
+        MOV        subFFTSize,#2
+        LSR        grpSize,subFFTNum,#1
+        MOV        subFFTNum,grpSize
+
+
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pointStep (in bytes) = 8*grpSize, with grpSize already halved above
+        @// Note: outPointStep = pointStep for firststage
+        @// Note: setCount shares its register with the updated grpSize
+
+        MOV        pointStep,grpSize,LSL #3
+        RSB        step,pointStep,#8
+
+
+        @// Loop on the sets for grp zero
+
+grpZeroSetLoop\name :
+
+        VLD1    dX0,[pSrc],pointStep
+        VLD1    dX1,[pSrc],step                   @// step = -pointStep + 8
+        SUBS    setCount,setCount,#1
+
+        VADD    dY0,dX0,dX1
+        VSUB    dY1,dX0,dX1
+
+        VST1    dY0,[pDst],outPointStep
+        @// dstStep =  step = -pointStep + 8
+        VST1    dY1,[pDst],dstStep
+
+        BGT     grpZeroSetLoop\name               @// flags from the SUBS above
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pDst advanced 8 bytes per set
+        MOV     pDst,pPingPongBuf
+
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        @// FFTSTAGE ignores the inverse flag here, so this differs only in labels
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+	.end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S
@@ -0,0 +1,153 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
+@// stage for a N point complex signal.
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+
+#define outPointStep    r3
+#define grpCount        r4
+#define dstStep         r5
+#define pTmp            r4
+
+@// Neon Registers
+
+#define dWr     d0.f32
+#define dWi     d1.f32
+#define dXr0    d2.f32
+#define dXi0    d3.f32
+#define dXr1    d4.f32
+#define dXi1    d5.f32
+#define dYr0    d6.f32
+#define dYi0    d7.f32
+#define dYr1    d8.f32
+#define dYi1    d9.f32
+#define qT0     d10.f32
+#define qT1     d12.f32
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// scaled is not referenced; inverse = TRUE uses the conjugate twiddle.
+        @// outPointStep = 8*subFFTSize bytes
+        MOV     outPointStep,subFFTSize,LSL #3
+        @// Update grpCount and grpSize right away
+
+        MOV     subFFTNum,#1                            @//after the last stage
+        LSL     grpCount,subFFTSize,#1
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        RSB      dstStep,outPointStep,#16
+
+
+        @// Loop on 2 grps at a time for the last stage
+
+radix2lsGrpLoop\name :
+        @ dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
+        @ dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
+        VLD2    {dWr,dWi},[pTwiddle :64]!
+
+        @ dXr0 = [pSrc[0].Re, pSrc[2].Re]
+        @ dXi0 = [pSrc[0].Im, pSrc[2].Im]
+        @ dXr1 = [pSrc[1].Re, pSrc[3].Re]
+        @ dXi1 = [pSrc[1].Im, pSrc[3].Im]
+        VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc :128]!
+        SUBS    grpCount,grpCount,#4                   @// grpCount is multiplied by 2
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   qT0,dWr,dXr1
+            VMLA   qT0,dWi,dXi1                       @// real part
+            VMUL   qT1,dWr,dXi1
+            VMLS   qT1,dWi,dXr1                       @// imag part
+            @// inverse: qT0 + j*qT1 = X1 * conj(W)
+        .else
+            @// forward: qT0 + j*qT1 = X1 * W
+            VMUL   qT0,dWr,dXr1
+            VMLS   qT0,dWi,dXi1                       @// real part
+            VMUL   qT1,dWr,dXi1
+            VMLA   qT1,dWi,dXr1                       @// imag part
+
+        .endif
+
+        VSUB    dYr0,dXr0,qT0
+        VSUB    dYi0,dXi0,qT1
+        VADD    dYr1,dXr0,qT0
+        VADD    dYi1,dXi0,qT1
+
+        VST2    {dYr0,dYi0},[pDst],outPointStep
+        VST2    {dYr1,dYi1},[pDst],dstStep                  @// dstStep =  step = -outPointStep + 16
+
+        BGT     radix2lsGrpLoop\name                        @// flags from SUBS grpCount above
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 4*size; pSrc -= 8*size bytes
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 4*size bytes
+
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe,r4,""
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        @// NOTE(review): only the Fwd M_START passes a trailing "" argument; confirm it equals omitting it
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+	.end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S
@@ -0,0 +1,191 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@// Description:
+@// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
+@// complex signal.  This handles the general stage, not the first or last
+@// stage.
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define outPointStep    r3
+#define pointStep       r4
+#define grpCount        r5
+#define setCount        r8
+@//const           RN  9
+#define step            r10
+#define dstStep         r11
+#define pTable          r9
+#define pTmp            r9
+
+@// Neon Registers
+
+#define dW      D0.F32
+#define dX0     D2.F32
+#define dX1     D3.F32
+#define dX2     D4.F32
+#define dX3     D5.F32
+#define dY0     D6.F32
+#define dY1     D7.F32
+#define dY2     D8.F32
+#define dY3     D9.F32
+#define qT0     D10.F32
+#define qT1     D11.F32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// scaled is not referenced; inverse = TRUE uses the conjugate twiddle.
+        @// (no stack arguments are used by this stage)
+
+
+        @// Update grpCount and grpSize right away in order to reuse pGrpCount
+        @// and pGrpSize regs
+
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
+        LSL     grpCount,subFFTSize,#1
+
+
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+        MOV     pointStep,subFFTNum,LSL #2
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes =
+        @//    4*size bytes
+        SMULBB  outPointStep,grpCount,pointStep             @// NOTE(review): 16x16-bit signed multiply; confirm both operands fit
+        LSL     pointStep,pointStep,#1                      @// pointStep is now 8*grpSize bytes
+
+
+        RSB      step,pointStep,#16
+        RSB      dstStep,outPointStep,#16
+
+        @// Loop on the groups
+
+radix2GrpLoop\name :
+        MOV      setCount,pointStep,LSR #3              @// setCount = pointStep/8
+        VLD1     dW,[pTwiddle],pointStep                @//[wi | wr]
+
+
+        @// Loop on the sets
+
+
+radix2SetLoop\name :
+
+
+        @// point0: dX0-real part dX1-img part
+        VLD2    {dX0,dX1},[pSrc],pointStep
+        @// point1: dX2-real part dX3-img part
+        VLD2    {dX2,dX3},[pSrc],step
+
+        SUBS    setCount,setCount,#2                    @// flags consumed by BGT radix2SetLoop
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   qT0,dX2,dW[0]
+            VMLA   qT0,dX3,dW[1]                       @// real part
+            VMUL   qT1,dX3,dW[0]
+            VMLS   qT1,dX2,dW[1]                       @// imag part
+            @// inverse: qT0 + j*qT1 = point1 * conj(W)
+        .else
+            @// forward: qT0 + j*qT1 = point1 * W
+            VMUL   qT0,dX2,dW[0]
+            VMLS   qT0,dX3,dW[1]                       @// real part
+            VMUL   qT1,dX3,dW[0]
+            VMLA   qT1,dX2,dW[1]                       @// imag part
+
+        .endif
+
+        VSUB    dY0,dX0,qT0
+        VSUB    dY1,dX1,qT1
+        VADD    dY2,dX0,qT0
+        VADD    dY3,dX1,qT1
+
+        VST2    {dY0,dY1},[pDst],outPointStep
+        @// dstStep = -outPointStep + 16
+        VST2    {dY2,dY3},[pDst],dstStep
+
+        BGT     radix2SetLoop\name
+
+        SUBS    grpCount,grpCount,#2
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix2GrpLoop\name                      @// flags from SUBS grpCount (ADD leaves them)
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst
+        @// pDst -= 4*size; pSrc -= 8*size bytes
+        SUB     pDst,pSrc,outPointStep,LSL #1
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        @// pTwiddle -= 4*size bytes
+        SUB     pTwiddle,pTwiddle,outPointStep
+
+
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+        @// Inverse entry point: same macro with inverse = TRUE and label suffix INV
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
@@ -0,0 +1,251 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a first stage Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Input registers (ARM core register aliases)
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define pPingPongBuf    r5
+#define subFFTNum       r6
+#define subFFTSize      r7
+@// subFFTNum and subFFTSize are also overwritten by FFTSTAGE
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize         r3
+@// setCount reuses the grpSize register (r3); outPointStep reuses pointStep (r4)
+#define setCount        r3
+#define pointStep       r4
+#define outPointStep    r4
+#define setStep         r8
+#define step1           r9
+#define step3           r10
+
+@// Neon Registers
+@// qXn/qYn/qZn are the Q-register views of the dXrn:dXin, dYrn:dYin, dZrn:dZin pairs
+#define dXr0    D0.F32
+#define dXi0    D1.F32
+#define dXr1    D2.F32
+#define dXi1    D3.F32
+#define dXr2    D4.F32
+#define dXi2    D5.F32
+#define dXr3    D6.F32
+#define dXi3    D7.F32
+#define dYr0    D8.F32
+#define dYi0    D9.F32
+#define dYr1    D10.F32
+#define dYi1    D11.F32
+#define dYr2    D12.F32
+#define dYi2    D13.F32
+#define dYr3    D14.F32
+#define dYi3    D15.F32
+#define qX0     Q0.F32
+#define qX1     Q1.F32
+#define qX2     Q2.F32
+#define qX3     Q3.F32
+#define qY0     Q4.F32
+#define qY1     Q5.F32
+#define qY2     Q6.F32
+#define qY3     Q7.F32
+#define dZr0    D16.F32
+#define dZi0    D17.F32
+#define dZr1    D18.F32
+#define dZi1    D19.F32
+#define dZr2    D20.F32
+#define dZi2    D21.F32
+#define dZr3    D22.F32
+#define dZi3    D23.F32
+#define qZ0     Q8.F32
+#define qZ1     Q9.F32
+#define qZ2     Q10.F32
+#define qZ3     Q11.F32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// First radix-4 stage: grp 0 only (all twiddles are 1), two sets per iteration
+        @// No stack arguments are defined here; the scaled argument is not referenced
+
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        @// Note: outPointStep = pointStep for firststage
+        @// pointStep = 2*subFFTNum, taken before subFFTNum is updated below
+        MOV     pointStep,subFFTNum,LSL #1
+
+
+        @// Update pSubFFTSize and pSubFFTNum regs
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        @// subFFTSize (1 for the first stage) is updated to 4 for the next stage
+        MOV     subFFTSize,#4
+
+        @// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+        MOV     subFFTNum,grpSize
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     setStep,pointStep,LSL #1
+        MOV     setStep,grpSize,LSL #4
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        @// setStep = 3*pointStep
+        ADD     setStep,setStep,pointStep
+        @// setStep = - 3*pointStep+16
+        RSB     setStep,setStep,#16
+
+        @//  data[3] & update pSrc for the next set
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep
+        @// step1 = 2*pointStep
+        MOV     step1,pointStep,LSL #1
+
+        VADD    qY0,qX0,qX2
+
+        @// step3 = -pointStep
+        RSB     step3,pointStep,#0
+
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets : 2 sets at a time
+
+radix4fsGrpZeroSetLoop\name :
+
+
+
+        @// Decrement setcount
+        SUBS    setCount,setCount,#2
+        @// The NEON ops below leave these flags intact for the BEQ and the closing BGT
+
+        @// finish first stage of 4 point FFT
+
+
+        VSUB    qY2,qX0,qX2
+
+        VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+        VADD    qY1,qX1,qX3
+        VLD2    {dXr2,dXi2},[pSrc :128],step3          @//  data[2]
+        VSUB    qY3,qX1,qX3
+
+
+        @// finish second stage of 4 point FFT
+
+        .ifeqs "\inverse", "TRUE"
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VADD    qZ0,qY0,qY1
+
+            @//  data[3] & update pSrc for the next set, but not if it's the
+            @//  last iteration so that we don't read past the end of the
+            @//  input array.
+            BEQ     radix4SkipLastUpdateInv\name
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep
+radix4SkipLastUpdateInv\name:
+            VSUB    dZr3,dYr2,dYi3
+
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VADD    dZi3,dYi2,dYr3
+
+            VSUB    qZ1,qY0,qY1
+            VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+            VADD    dZr2,dYr2,dYi3
+            VST2    {dZr1,dZi1},[pDst :128],outPointStep
+            VSUB    dZi2,dYi2,dYr3
+
+            VADD    qY0,qX0,qX2                     @// u0 for next iteration
+            VST2    {dZr2,dZi2},[pDst :128],setStep
+
+
+        .else
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VADD    qZ0,qY0,qY1
+
+            @//  data[3] & update pSrc for the next set, but not if it's the
+            @//  last iteration so that we don't read past the end of the
+            @//  input array.
+            BEQ     radix4SkipLastUpdateFwd\name
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep
+radix4SkipLastUpdateFwd\name:
+            VADD    dZr2,dYr2,dYi3
+
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VSUB    dZi2,dYi2,dYr3
+
+            VSUB    qZ1,qY0,qY1
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+
+            VSUB    dZr3,dYr2,dYi3
+            VST2    {dZr1,dZi1},[pDst :128],outPointStep
+            VADD    dZi3,dYi2,dYr3
+
+            VADD    qY0,qX0,qX2                     @// u0 for next iteration
+            VST2    {dZr3,dZi3},[pDst :128],setStep
+
+        .endif
+
+        BGT     radix4fsGrpZeroSetLoop\name
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep
+        MOV     pDst,pPingPongBuf
+
+
+        .endm
+
+
+        @// Forward variant: FFTSTAGE expanded with inverse = "FALSE"; labels get the suffix fwd
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+        @// Inverse variant: FFTSTAGE expanded with inverse = "TRUE"; labels get the suffix inv
+        M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S
@@ -0,0 +1,339 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+    @//IMPORT  armAAC_constTable
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+
+@//Output Registers
+
+
+@// Local scratch registers (pTmp shares r4 with grpCount)
+
+#define outPointStep    r3
+#define grpCount        r4
+#define dstStep         r5
+#define grpTwStep       r8
+#define stepTwiddle     r9
+#define twStep          r10
+#define pTmp            r4
+#define step16          r11
+#define step24          r12
+
+
+@// Neon Registers
+@// dButterfly* and dX* are two sets of names for the same D0-D7 registers
+#define dButterfly1Real02       D0.F32
+#define dButterfly1Imag02       D1.F32
+#define dButterfly1Real13       D2.F32
+#define dButterfly1Imag13       D3.F32
+#define dButterfly2Real02       D4.F32
+#define dButterfly2Imag02       D5.F32
+#define dButterfly2Real13       D6.F32
+#define dButterfly2Imag13       D7.F32
+#define dXr0                    D0.F32
+#define dXi0                    D1.F32
+#define dXr1                    D2.F32
+#define dXi1                    D3.F32
+#define dXr2                    D4.F32
+#define dXi2                    D5.F32
+#define dXr3                    D6.F32
+#define dXi3                    D7.F32
+
+#define dYr0                    D16.F32
+#define dYi0                    D17.F32
+#define dYr1                    D18.F32
+#define dYi1                    D19.F32
+#define dYr2                    D20.F32
+#define dYi2                    D21.F32
+#define dYr3                    D22.F32
+#define dYi3                    D23.F32
+
+#define dW1r                    D8.F32
+#define dW1i                    D9.F32
+#define dW2r                    D10.F32
+#define dW2i                    D11.F32
+#define dW3r                    D12.F32
+#define dW3i                    D13.F32
+#define qT0                     d14.f32
+#define qT1                     d16.F32
+#define qT2                     d18.F32
+#define qT3                     d20.f32
+#define qT4                     d22.f32
+#define qT5                     d24.f32
+@// NOTE: qT0-qT5 alias D (not Q) registers and are not referenced in this file
+#define dZr0                    D14.F32
+#define dZi0                    D15.F32
+#define dZr1                    D26.F32
+#define dZi1                    D27.F32
+#define dZr2                    D28.F32
+#define dZi2                    D29.F32
+#define dZr3                    D30.F32
+#define dZi3                    D31.F32
+@// Q views of the D pairs above: qX0 = dXr0:dXi0, qYn = dYrn:dYin, qZn = dZrn:dZin
+#define qX0                     Q0.F32
+#define qY0                     Q8.F32
+#define qY1                     Q9.F32
+#define qY2                     Q10.F32
+#define qY3                     Q11.F32
+#define qZ0                     Q7.F32
+#define qZ1                     Q13.F32
+#define qZ2                     Q14.F32
+#define qZ3                     Q15.F32
+
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+        @// Last radix-4 stage: two groups per loop iteration, twiddles reloaded each iteration
+        @// No stack arguments are defined here; the scaled argument is not referenced
+
+
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes
+        MOV     outPointStep,subFFTSize,LSL #3
+
+        @// Update grpCount and grpSize rightaway
+
+        VLD2    {dW1r,dW1i},[pTwiddle :128]             @// [wi|wr]
+        MOV     step16,#16
+        LSL     grpCount,subFFTSize,#2
+
+        VLD1    dW2r,[pTwiddle :64]                     @// [wi|wr]
+        MOV     subFFTNum,#1                            @//after the last stage
+
+        VLD1    dW3r,[pTwiddle :64],step16              @// [wi|wr]
+        MOV     stepTwiddle,#0
+
+        VLD1    dW2i,[pTwiddle :64]!                    @// [wi|wr]
+        SUB     grpTwStep,stepTwiddle,#8                @// grpTwStep = -8 to start with
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
+        MOV     dstStep,outPointStep,LSL #1
+
+        @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
+        ADD     dstStep,dstStep,outPointStep            @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                     @// dstStep = - 3*outPointStep+16
+        MOV     step24,#24
+
+        @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
+
+
+        @// Process two groups at a time
+
+radix4lsGrpLoop\name :
+
+        VZIP    dW2r,dW2i
+        ADD     stepTwiddle,stepTwiddle,#16
+        VZIP    dW3r,dW3i
+        ADD     grpTwStep,stepTwiddle,#4
+        VUZP     dButterfly1Real13, dButterfly2Real13   @// B.r D.r
+        SUB     twStep,stepTwiddle,#16                  @// -16+stepTwiddle
+        VUZP     dButterfly1Imag13, dButterfly2Imag13   @// B.i D.i
+        MOV     grpTwStep,grpTwStep,LSL #1
+        VUZP     dButterfly1Real02, dButterfly2Real02   @// A.r C.r
+        RSB     grpTwStep,grpTwStep,#0                  @// -8-2*stepTwiddle
+
+
+        VUZP     dButterfly1Imag02, dButterfly2Imag02   @// A.i C.i
+
+
+        @// grpCount is multiplied by 4
+        SUBS    grpCount,grpCount,#8
+        @// The NEON ops below leave these flags intact for addeq/beq and the closing BGT
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   dZr1,dW1r,dXr1
+            VMLA   dZr1,dW1i,dXi1                       @// real part
+            VMUL   dZi1,dW1r,dXi1
+            VMLS   dZi1,dW1i,dXr1                       @// imag part
+
+        .else
+
+            VMUL   dZr1,dW1r,dXr1
+            VMLS   dZr1,dW1i,dXi1                       @// real part
+            VMUL   dZi1,dW1r,dXi1
+            VMLA   dZi1,dW1i,dXr1                       @// imag part
+
+        .endif
+
+        VLD2    {dW1r,dW1i},[pTwiddle :128],stepTwiddle      @// [wi|wr]
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   dZr2,dW2r,dXr2
+            VMLA   dZr2,dW2i,dXi2                       @// real part
+            VMUL   dZi2,dW2r,dXi2
+            VLD1   dW2r,[pTwiddle :64],step16           @// [wi|wr]
+            VMLS   dZi2,dW2i,dXr2                       @// imag part
+
+        .else
+
+            VMUL   dZr2,dW2r,dXr2
+            VMLS   dZr2,dW2i,dXi2                       @// real part
+            VMUL   dZi2,dW2r,dXi2
+            VLD1    dW2r,[pTwiddle :64],step16          @// [wi|wr]
+            VMLA   dZi2,dW2i,dXr2                       @// imag part
+
+        .endif
+
+
+        VLD1    dW2i,[pTwiddle :64],twStep              @// [wi|wr]
+
+        @// move qX0 so as to load for the next iteration
+        VMOV     qZ0,qX0
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   dZr3,dW3r,dXr3
+            VMLA   dZr3,dW3i,dXi3                       @// real part
+            VMUL   dZi3,dW3r,dXi3
+            VLD1    dW3r,[pTwiddle :64],step24
+            VMLS   dZi3,dW3i,dXr3                       @// imag part
+
+        .else
+
+            VMUL   dZr3,dW3r,dXr3
+            VMLS   dZr3,dW3i,dXi3                       @// real part
+            VMUL   dZi3,dW3r,dXi3
+            VLD1    dW3r,[pTwiddle :64],step24
+            VMLA   dZi3,dW3i,dXr3                       @// imag part
+
+        .endif
+
+        VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
+
+        @// Don't do the load on the last iteration so we don't read past the end
+        @// of pSrc.
+        addeq   pSrc, pSrc, #64
+        beq     radix4lsSkipRead\name
+        @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
+
+        @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
+radix4lsSkipRead\name:
+
+        @// finish first stage of 4 point FFT
+
+        VADD    qY0,qZ0,qZ2
+        VSUB    qY2,qZ0,qZ2
+        VADD    qY1,qZ1,qZ3
+        VSUB    qY3,qZ1,qZ3
+
+
+        @// finish second stage of 4 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            VSUB    qZ0,qY2,qY1
+
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            @// dstStep = -3*outPointStep + 16
+            VST2    {dZr1,dZi1},[pDst :128],dstStep
+
+
+        .else
+
+            VSUB    qZ0,qY2,qY1
+
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr1,dZi1},[pDst :128],outPointStep
+
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            @// dstStep = -3*outPointStep + 16
+            VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+        .endif
+
+        BGT     radix4lsGrpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst
+        @// Extra increment done in final iteration of the loop
+        SUB     pSrc,pSrc,#64
+        @// pDst = pSrc - 4*outPointStep; pSrc = old pDst - outPointStep
+        SUB     pDst,pSrc,outPointStep,LSL #2
+        SUB     pSrc,pTmp,outPointStep
+        SUB     pTwiddle,pTwiddle,subFFTSize,LSL #1
+        @// Extra increment done in final iteration of the loop
+        SUB     pTwiddle,pTwiddle,#16
+
+        .endm
+
+        @// Forward variant: FFTSTAGE expanded with inverse = "FALSE"; labels get the suffix fwd
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        @// Inverse variant: FFTSTAGE expanded with inverse = "TRUE"; labels get the suffix inv
+        M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
@@ -0,0 +1,331 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+
+@//Output Registers
+
+
+@// Local scratch registers (t1 shares r3 with grpCount)
+@// NOTE(review): setCount is r14, the ARM link register - confirm M_START/M_END preserve LR
+#define grpCount        r3
+#define pointStep       r4
+#define outPointStep    r5
+#define stepTwiddle     r12
+#define setCount        r14
+#define srcStep         r8
+#define setStep         r9
+#define dstStep         r10
+#define twStep          r11
+#define t1              r3
+
+@// Neon Registers
+@// NOTE: qT0-qT3 below alias D (not Q) registers and are not referenced in this file
+#define dW1     D0.F32
+#define dW2     D1.F32
+#define dW3     D2.F32
+
+#define dXr0    D4.F32
+#define dXi0    D5.F32
+#define dXr1    D6.F32
+#define dXi1    D7.F32
+#define dXr2    D8.F32
+#define dXi2    D9.F32
+#define dXr3    D10.F32
+#define dXi3    D11.F32
+#define dYr0    D12.F32
+#define dYi0    D13.F32
+#define dYr1    D14.F32
+#define dYi1    D15.F32
+#define dYr2    D16.F32
+#define dYi2    D17.F32
+#define dYr3    D18.F32
+#define dYi3    D19.F32
+#define qT0     d16.f32
+#define qT1     d18.f32
+#define qT2     d12.f32
+#define qT3     d14.f32
+#define dZr0    D20.F32
+#define dZi0    D21.F32
+#define dZr1    D22.F32
+#define dZi1    D23.F32
+#define dZr2    D24.F32
+#define dZi2    D25.F32
+#define dZr3    D26.F32
+#define dZi3    D27.F32
+@// Q views of the D pairs above: qX0 = dXr0:dXi0, qYn = dYrn:dYin, qZn = dZrn:dZin
+#define qY0     Q6.F32
+#define qY1     Q7.F32
+#define qY2     Q8.F32
+#define qY3     Q9.F32
+#define qX0     Q2.F32
+#define qZ0     Q10.F32
+#define qZ1     Q11.F32
+#define qZ2     Q12.F32
+#define qZ3     Q13.F32
+
+        .MACRO FFTSTAGE scaled, inverse , name
+        @// Radix-4 stage: radix4GrpLoop walks the groups, radix4SetLoop does two sets per pass
+        @// No stack arguments are defined here; the scaled argument is not referenced
+
+
+        @// Update grpCount and grpSize rightaway inorder to reuse
+        @// pGrpCount and pGrpSize regs
+
+        LSL     grpCount,subFFTSize,#2
+        LSR     subFFTNum,subFFTNum,#2
+        MOV     subFFTSize,grpCount
+
+        VLD1     dW1,[pTwiddle]                    @//[wi | wr]
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        MOV     pointStep,subFFTNum,LSL #1
+
+
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes
+        @//   = 2*size bytes
+        @// SMULBB multiplies only the signed low 16 bits of grpCount and pointStep
+        MOV     stepTwiddle,#0
+        VLD1     dW2,[pTwiddle]                    @//[wi | wr]
+        SMULBB  outPointStep,grpCount,pointStep
+        LSL     pointStep,pointStep,#2             @// 2*grpSize
+
+        VLD1     dW3,[pTwiddle]                    @//[wi | wr]
+        MOV     srcStep,pointStep,LSL #1           @// srcStep = 2*pointStep
+        ADD     setStep,srcStep,pointStep          @// setStep = 3*pointStep
+
+        RSB     setStep,setStep,#0                 @// setStep = - 3*pointStep
+        SUB     srcStep,srcStep,#16                @// srcStep = 2*pointStep-16
+
+        MOV     dstStep,outPointStep,LSL #1
+        ADD     dstStep,dstStep,outPointStep       @// dstStep = 3*outPointStep
+        @// dstStep = - 3*outPointStep+16
+        RSB     dstStep,dstStep,#16
+
+
+
+radix4GrpLoop\name :
+
+        VLD2    {dXr0,dXi0},[pSrc],pointStep       @//  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        VLD2    {dXr1,dXi1},[pSrc],pointStep       @//  data[1]
+        @// set pTwiddle to the first point
+        ADD      pTwiddle,pTwiddle,stepTwiddle
+        VLD2    {dXr2,dXi2},[pSrc],pointStep       @//  data[2]
+        MOV      twStep,stepTwiddle,LSL #2
+
+        @//  data[3] & update pSrc for the next set
+        VLD2    {dXr3,dXi3},[pSrc],setStep
+        SUB      twStep,stepTwiddle,twStep         @// twStep = -3*stepTwiddle
+
+        MOV      setCount,pointStep,LSR #3
+        @// set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,#16
+        @// increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+
+
+        @// Loop on the sets
+
+radix4SetLoop\name :
+
+
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   dZr1,dXr1,dW1[0]
+            VMUL   dZi1,dXi1,dW1[0]
+            VMUL   dZr2,dXr2,dW2[0]
+            VMUL   dZi2,dXi2,dW2[0]
+            VMUL   dZr3,dXr3,dW3[0]
+            VMUL   dZi3,dXi3,dW3[0]
+
+            VMLA   dZr1,dXi1,dW1[1]                @// real part
+            VMLS   dZi1,dXr1,dW1[1]                @// imag part
+
+            @//  data[1] for next iteration
+            VLD2    {dXr1,dXi1},[pSrc],pointStep
+
+            VMLA   dZr2,dXi2,dW2[1]                @// real part
+            VMLS   dZi2,dXr2,dW2[1]                @// imag part
+
+            @//  data[2] for next iteration
+            VLD2    {dXr2,dXi2},[pSrc],pointStep
+
+            VMLA   dZr3,dXi3,dW3[1]                @// real part
+            VMLS   dZi3,dXr3,dW3[1]                @// imag part
+        .else
+            VMUL   dZr1,dXr1,dW1[0]
+            VMUL   dZi1,dXi1,dW1[0]
+            VMUL   dZr2,dXr2,dW2[0]
+            VMUL   dZi2,dXi2,dW2[0]
+            VMUL   dZr3,dXr3,dW3[0]
+            VMUL   dZi3,dXi3,dW3[0]
+
+            VMLS   dZr1,dXi1,dW1[1]                @// real part
+            VMLA   dZi1,dXr1,dW1[1]                @// imag part
+
+            @//  data[1] for next iteration
+            VLD2    {dXr1,dXi1},[pSrc],pointStep
+
+            VMLS   dZr2,dXi2,dW2[1]                @// real part
+            VMLA   dZi2,dXr2,dW2[1]                @// imag part
+
+            @//  data[2] for next iteration
+            VLD2    {dXr2,dXi2},[pSrc],pointStep
+
+            VMLS   dZr3,dXi3,dW3[1]                @// real part
+            VMLA   dZi3,dXr3,dW3[1]                @// imag part
+        .endif
+
+        @//  data[3] & update pSrc to data[0]
+        @// But don't read on the very last iteration because that reads past
+        @// the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
+        cmp     grpCount, #4
+        cmpeq   setCount, #2                      @// Test setCount if grpCount = 4
+        @// These are executed only if both grpCount = 4 and setCount = 2
+        addeq   pSrc, pSrc, setStep
+        beq     radix4SkipRead\name
+        VLD2    {dXr3,dXi3},[pSrc],setStep
+radix4SkipRead\name:
+        SUBS    setCount,setCount,#2
+        @// Flags from this SUBS reach the BGT that closes the set loop (the ADD leaves them alone)
+        @// finish first stage of 4 point FFT
+        VADD    qY0,qX0,qZ2
+        VSUB    qY2,qX0,qZ2
+
+        @//  data[0] for next iteration
+        VLD2    {dXr0,dXi0},[pSrc :128]!
+        VADD    qY1,qZ1,qZ3
+        VSUB    qY3,qZ1,qZ3
+
+        @// finish second stage of 4 point FFT
+
+        VSUB    qZ0,qY2,qY1
+
+
+        .ifeqs  "\inverse", "TRUE"
+
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            VST2    {dZr1,dZi1},[pDst :128],dstStep
+
+
+        .else
+
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr1,dZi1},[pDst :128],outPointStep
+
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+        .endif
+
+        @// increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix4SetLoop\name
+
+
+        VLD1     dW1,[pTwiddle :64],stepTwiddle    @//[wi | wr]
+        @// subtract 4 since grpCount multiplied by 4
+        SUBS    grpCount,grpCount,#4
+        VLD1     dW2,[pTwiddle :64],stepTwiddle    @//[wi | wr]
+        @// increment pSrc for the next grp
+        ADD     pSrc,pSrc,srcStep
+        VLD1     dW3,[pTwiddle :64],twStep         @//[wi | wr]
+        BGT     radix4GrpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     t1,pDst
+        @// pDst = pSrc - 4*outPointStep; pSrc = old pDst - outPointStep
+        SUB     pDst,pSrc,outPointStep,LSL #2
+        SUB     pSrc,t1,outPointStep
+
+
+        .endm
+
+        @// Forward variant: FFTSTAGE expanded with inverse = "FALSE"; labels get the suffix FWD
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+        @// Inverse variant: FFTSTAGE expanded with inverse = "TRUE"; labels get the suffix INV
+        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
@@ -0,0 +1,426 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+@// Guarding implementation by the processor name
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+@// dest buffer for the next stage (not pSrc for first stage)
+#define pPingPongBuf    r5
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize         r3
+@// setCount reuses the grpSize register (r3); outPointStep reuses pointStep (r4)
+#define setCount        r3
+#define pointStep       r4
+#define outPointStep    r4
+#define setStep         r8
+#define step1           r9
+#define step2           r10
+#define t0              r11
+
+
+@// Neon Registers
+
+#define dXr0    D0.F32
+#define dXi0    D1.F32
+#define dXr1    D2.F32
+#define dXi1    D3.F32
+#define dXr2    D4.F32
+#define dXi2    D5.F32
+#define dXr3    D6.F32
+#define dXi3    D7.F32
+#define dXr4    D8.F32
+#define dXi4    D9.F32
+#define dXr5    D10.F32
+#define dXi5    D11.F32
+#define dXr6    D12.F32
+#define dXi6    D13.F32
+#define dXr7    D14.F32
+#define dXi7    D15.F32
+#define qX0     Q0.F32
+#define qX1     Q1.F32
+#define qX2     Q2.F32
+#define qX3     Q3.F32
+#define qX4     Q4.F32
+#define qX5     Q5.F32
+#define qX6     Q6.F32
+#define qX7     Q7.F32
+@// dU*, dV* and dY* below all alias D16-D31 (e.g. dUr0, dVr1 and dYr0 are all D16)
+#define dUr0    D16.F32
+#define dUi0    D17.F32
+#define dUr2    D18.F32
+#define dUi2    D19.F32
+#define dUr4    D20.F32
+#define dUi4    D21.F32
+#define dUr6    D22.F32
+#define dUi6    D23.F32
+#define dUr1    D24.F32
+#define dUi1    D25.F32
+#define dUr3    D26.F32
+#define dUi3    D27.F32
+#define dUr5    D28.F32
+#define dUi5    D29.F32
+@// NOTE: dUr7/dUi7 map to D30/D31, not to the dXr7/dXi7 registers (D14/D15)
+#define dUr7    D30.F32
+#define dUi7    D31.F32
+#define qU0     Q8.F32
+#define qU1     Q12.F32
+#define qU2     Q9.F32
+#define qU3     Q13.F32
+#define qU4     Q10.F32
+#define qU5     Q14.F32
+#define qU6     Q11.F32
+#define qU7     Q15.F32
+
+
+#define dVr0    D24.F32
+#define dVi0    D25.F32
+#define dVr2    D26.F32
+#define dVi2    D27.F32
+#define dVr4    D28.F32
+#define dVi4    D29.F32
+#define dVr6    D30.F32
+#define dVi6    D31.F32
+#define dVr1    D16.F32
+#define dVi1    D17.F32
+#define dVr3    D18.F32
+#define dVi3    D19.F32
+#define dVr5    D20.F32
+#define dVi5    D21.F32
+#define dVr7    D22.F32
+#define dVi7    D23.F32
+#define qV0     Q12.F32
+#define qV1     Q8.F32
+#define qV2     Q13.F32
+#define qV3     Q9.F32
+#define qV4     Q14.F32
+#define qV5     Q10.F32
+#define qV6     Q15.F32
+#define qV7     Q11.F32
+
+#define dYr0    D16.F32
+#define dYi0    D17.F32
+#define dYr2    D18.F32
+#define dYi2    D19.F32
+#define dYr4    D20.F32
+#define dYi4    D21.F32
+#define dYr6    D22.F32
+#define dYi6    D23.F32
+#define dYr1    D24.F32
+#define dYi1    D25.F32
+#define dYr3    D26.F32
+#define dYi3    D27.F32
+#define dYr5    D28.F32
+#define dYi5    D29.F32
+#define dYr7    D30.F32
+#define dYi7    D31.F32
+#define qY0     Q8.F32
+#define qY1     Q12.F32
+#define qY2     Q9.F32
+#define qY3     Q13.F32
+#define qY4     Q10.F32
+#define qY5     Q14.F32
+#define qY6     Q11.F32
+#define qY7     Q15.F32
+@// dT0/dT1 share D14/D15 with dXr7/dXi7
+#define dT0     D14.F32
+#define dT1     D15.F32
+
+@// Define constants
+        @ sqrt(1/2)
+ONEBYSQRT2:     .float  0.7071067811865476e0
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
+        @// Update pSubFFTSize and pSubFFTNum regs
+        @// subFFTSize (1 for the first stage) is updated to 8 for the next stage
+        MOV     subFFTSize,#8
+        LDR     t0,=ONEBYSQRT2
+
+        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3
+        MOV     subFFTNum,grpSize
+
+
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
+        @// Note: outPointStep = pointStep for firststage
+
+        MOV     pointStep,grpSize,LSL #3
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     step1,pointStep,LSL #1             @// step1 = 2*pointStep
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep     @//  data[0]
+        MOV     step1,grpSize,LSL #4
+
+        MOV     step2,pointStep,LSL #3
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep     @//  data[1]
+        SUB     step2,step2,pointStep                 @// step2 = 7*pointStep
+        @// setStep = - 7*pointStep+16
+        RSB     setStep,step2,#16
+
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep     @//  data[2]
+        VLD2    {dXr3,dXi3},[pSrc :128],pointStep     @//  data[3]
+        VLD2    {dXr4,dXi4},[pSrc :128],pointStep     @//  data[4]
+        VLD2    {dXr5,dXi5},[pSrc :128],pointStep     @//  data[5]
+        VLD2    {dXr6,dXi6},[pSrc :128],pointStep     @//  data[6]
+        @//  data[7] & update pSrc for the next set
+        @//  setStep = -7*pointStep + 16
+        VLD2    {dXr7,dXi7},[pSrc :128],setStep
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets
+
+radix8fsGrpZeroSetLoop\name :
+
+        @// Decrement setcount
+        SUBS    setCount,setCount,#2
+
+
+        @// finish first stage of 8 point FFT
+
+        VADD    qU0,qX0,qX4
+        VADD    qU2,qX1,qX5
+        VADD    qU4,qX2,qX6
+        VADD    qU6,qX3,qX7
+
+        @// finish second stage of 8 point FFT
+
+        VADD    qV0,qU0,qU4
+        VSUB    qV2,qU0,qU4
+        VADD    qV4,qU2,qU6
+        VSUB    qV6,qU2,qU6
+
+        @// finish third stage of 8 point FFT
+
+        VADD    qY0,qV0,qV4
+        VSUB    qY4,qV0,qV4
+        VST2    {dYr0,dYi0},[pDst :128],step1         @// store y0
+
+        .ifeqs  "\inverse", "TRUE"
+
+            VSUB    dYr2,dVr2,dVi6
+            VADD    dYi2,dVi2,dVr6
+
+            VADD    dYr6,dVr2,dVi6
+            VST2    {dYr2,dYi2},[pDst :128],step1     @// store y2
+            VSUB    dYi6,dVi2,dVr6
+
+            VSUB    qU1,qX0,qX4
+            VST2    {dYr4,dYi4},[pDst :128],step1     @// store y4
+
+            VSUB    qU3,qX1,qX5
+            VSUB    qU5,qX2,qX6
+            VST2    {dYr6,dYi6},[pDst :128],step1     @// store y6
+
+        .ELSE
+
+            VADD    dYr6,dVr2,dVi6
+            VSUB    dYi6,dVi2,dVr6
+
+            VSUB    dYr2,dVr2,dVi6
+            VST2    {dYr6,dYi6},[pDst :128],step1     @// store y2
+            VADD    dYi2,dVi2,dVr6
+
+
+            VSUB    qU1,qX0,qX4
+            VST2    {dYr4,dYi4},[pDst :128],step1     @// store y4
+            VSUB    qU3,qX1,qX5
+            VSUB    qU5,qX2,qX6
+            VST2    {dYr2,dYi2},[pDst :128],step1     @// store y6
+
+
+        .ENDIF
+
+        @// finish first stage of 8 point FFT
+
+        VSUB    qU7,qX3,qX7
+        VLD1    dT0[0], [t0]
+
+        @// finish second stage of 8 point FFT
+
+        VSUB    dVr1,dUr1,dUi5
+        @//  data[0] for next iteration
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep
+        VADD    dVi1,dUi1,dUr5
+        VADD    dVr3,dUr1,dUi5
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep     @//  data[1]
+        VSUB    dVi3,dUi1,dUr5
+
+        VSUB    dVr5,dUr3,dUi7
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep     @//  data[2]
+        VADD    dVi5,dUi3,dUr7
+        VADD    dVr7,dUr3,dUi7
+        VLD2    {dXr3,dXi3},[pSrc :128],pointStep     @//  data[3]
+        VSUB    dVi7,dUi3,dUr7
+
+        @// finish third stage of 8 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            @// calculate a*v5
+            VMUL    dT1,dVr5,dT0[0]                   @// use dVi0 for dT1
+
+            VLD2    {dXr4,dXi4},[pSrc :128],pointStep @//  data[4]
+            VMUL    dVi5,dVi5,dT0[0]
+
+            VLD2    {dXr5,dXi5},[pSrc :128],pointStep @//  data[5]
+            VSUB    dVr5,dT1,dVi5                     @// a * V5
+            VADD    dVi5,dT1,dVi5
+
+            VLD2    {dXr6,dXi6},[pSrc :128],pointStep @//  data[6]
+
+            @// calculate  b*v7
+            VMUL    dT1,dVr7,dT0[0]
+            VMUL    dVi7,dVi7,dT0[0]
+
+            VADD    qY1,qV1,qV5
+            VSUB    qY5,qV1,qV5
+
+
+            VADD    dVr7,dT1,dVi7                     @// b * V7
+            VSUB    dVi7,dVi7,dT1
+            SUB     pDst, pDst, step2                 @// set pDst to y1
+
+            @// On the last iteration,  this will read past the end of pSrc, 
+            @// so skip this read.
+            BEQ     radix8SkipLastUpdateInv\name
+            VLD2    {dXr7,dXi7},[pSrc :128],setStep   @//  data[7]
+radix8SkipLastUpdateInv\name:
+
+            VSUB    dYr3,dVr3,dVr7
+            VSUB    dYi3,dVi3,dVi7
+            VST2    {dYr1,dYi1},[pDst :128],step1     @// store y1
+            VADD    dYr7,dVr3,dVr7
+            VADD    dYi7,dVi3,dVi7
+
+
+            VST2    {dYr3,dYi3},[pDst :128],step1     @// store y3
+            VST2    {dYr5,dYi5},[pDst :128],step1     @// store y5
+            VST2    {dYr7,dYi7},[pDst :128]           @// store y7
+            ADD pDst, pDst, #16
+
+        .ELSE
+
+            @// calculate  b*v7
+            VMUL    dT1,dVr7,dT0[0]
+            VLD2    {dXr4,dXi4},[pSrc :128],pointStep @//  data[4]
+            VMUL    dVi7,dVi7,dT0[0]
+
+            VLD2    {dXr5,dXi5},[pSrc :128],pointStep @//  data[5]
+            VADD    dVr7,dT1,dVi7                     @// b * V7
+            VSUB    dVi7,dVi7,dT1
+
+            VLD2    {dXr6,dXi6},[pSrc :128],pointStep @//  data[6]
+
+            @// calculate a*v5
+            VMUL    dT1,dVr5,dT0[0]                   @// use dVi0 for dT1
+            VMUL    dVi5,dVi5,dT0[0]
+
+            VADD    dYr7,dVr3,dVr7
+            VADD    dYi7,dVi3,dVi7
+            SUB     pDst, pDst, step2                 @// set pDst to y1
+
+            VSUB    dVr5,dT1,dVi5                     @// a * V5
+            VADD    dVi5,dT1,dVi5
+
+            @// On the last iteration,  this will read past the end of pSrc, 
+            @// so skip this read.
+            BEQ     radix8SkipLastUpdateFwd\name
+            VLD2    {dXr7,dXi7},[pSrc :128],setStep   @//  data[7]
+radix8SkipLastUpdateFwd\name:
+
+            VSUB    qY5,qV1,qV5
+
+            VSUB    dYr3,dVr3,dVr7
+            VST2    {dYr7,dYi7},[pDst :128],step1     @// store y1
+            VSUB    dYi3,dVi3,dVi7
+            VADD    qY1,qV1,qV5
+
+
+            VST2    {dYr5,dYi5},[pDst :128],step1     @// store y3
+            VST2    {dYr3,dYi3},[pDst :128],step1     @// store y5
+            VST2    {dYr1,dYi1},[pDst :128]!          @// store y7
+
+        .ENDIF
+
+
+        @// update pDst for the next set
+        SUB     pDst, pDst, step2
+        BGT     radix8fsGrpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                   @// pDst -= 2*grpSize
+        MOV     pDst,pPingPongBuf
+
+
+
+        .endm
+
+
+        @// Entry points: the stage macro expanded once per direction (scaled flag FALSE in both).
+        @// M_START and M_END are not defined in the visible part of this file.
+        @// NOTE(review): confirm which registers the r4 argument makes M_START preserve.
+        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+
+        .end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
@@ -0,0 +1,170 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6693
+@// Last Modified Date:       Tue, 10 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Input registers (pTwiddle is named here but the first-stage macro below never reads it)
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define pPingPongBuf                    r5
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@// Output registers: none are named; the macro rewrites pSrc, pDst, subFFTNum and subFFTSize in place
+
+
+@// Local scratch registers. Each pair below names the same register:
+@// pointStep/outPointStep = r3, grpSize/setCount = r4, step/dstStep = r8.
+#define pointStep                       r3
+#define outPointStep                    r3
+#define grpSize                         r4
+#define setCount                        r4
+#define step                            r8
+#define dstStep                         r8
+
+@// NEON registers: the S16 and S32 names are two views of the same D registers;
+@// the S32 view is used to move 32 bits (two S16 values) per lane access.
+#define dX0                             D0.S16
+#define dX1                             D1.S16
+#define dY0                             D2.S16
+#define dY1                             D3.S16
+#define dX0S32                          D0.S32
+#define dX1S32                          D1.S32
+#define dY0S32                          D2.S32
+#define dY1S32                          D3.S32
+
+
+        .macro FFTSTAGE scaled, inverse, name
+
+        @// First radix-2 stage on S16 data: one butterfly (two 32-bit points) per pass.
+        @// scaled picks the halving add/sub forms; inverse is not referenced here.
+
+        @// On exit subFFTNum has been halved and subFFTSize is 2.
+
+
+        mov        grpSize,subFFTNum,lsr #1
+        mov        subFFTNum,grpSize
+        mov        subFFTSize,#2
+
+
+        @// Each point moved is 4 bytes, so the two inputs of a butterfly
+        @// are pointStep = 4*grpSize bytes apart.
+        @// outPointStep names the same register: outputs use the same spacing.
+        @// setCount names the grpSize register: the halved subFFTNum is the trip count.
+
+        lsl        pointStep,grpSize,#2
+        rsb        step,pointStep,#4
+
+
+        @// Group zero only, so no twiddle multiply: walk the sets one at a time
+
+grpZeroSetLoop\name:
+
+        vld1    {dX0S32[0]},[pSrc],pointStep              @// first input, then jump to its partner
+        vld1    {dX1S32[0]},[pSrc],step                   @// second input, then back to the next set
+        subs    setCount,setCount,#1              @// flags are consumed by bgt below
+
+        .ifeqs "\scaled", "TRUE"
+
+            vhadd    dY0,dX0,dX1
+            vhsub    dY1,dX0,dX1
+
+        .else
+
+            vadd    dY0,dX0,dX1
+            vsub    dY1,dX0,dX1
+
+
+        .endif
+
+        vst1    {dY0S32[0]},[pDst],outPointStep
+        vst1    {dY1S32[0]},[pDst],dstStep                  @// dstStep is the step register: -pointStep + 4
+
+        bgt     grpZeroSetLoop\name
+
+
+        @// Hand the buffers on: pSrc = pDst - pointStep, pDst = pPingPongBuf
+        sub     pSrc,pDst,pointStep
+        mov     pDst,pPingPongBuf
+
+        .endm
+
+@// Entry points: FFTSTAGE expanded four times with (scaled, inverse, label suffix).
+@// NOTE(review): M_START/M_END are defined outside this file; confirm what the r4 argument makes them preserve.
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+@// Inverse, unscaled. The first-stage macro never tests the inverse flag, so only the label suffix differs.
+
+        M_START armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+@// Forward, scaled: the halving add/sub forms are selected
+
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+@// Inverse, scaled
+
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+    .END
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
@@ -0,0 +1,210 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6741
+@// Last Modified Date:       Wed, 18 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Input registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@// Output registers: none are named; the macro rewrites the input registers in place
+
+
+@// Local scratch registers
+@// grpCount and pTmp both name r4: pTmp is only written after the group loop has finished
+
+#define outPointStep                    r3
+#define grpCount                        r4
+#define dstStep                         r5
+#define pTmp                            r4
+#define step                            r8
+
+@// NEON registers: qT0/qT1 (Q5/Q6 = D10-D13) do not overlap D0-D9
+@// NOTE(review): D8-D13 are callee-saved under AAPCS; confirm the caller or M_START preserves them
+#define dWr                             D0.S16
+#define dWi                             D1.S16
+#define dXr0                            D2.S16
+#define dXi0                            D3.S16
+#define dXr1                            D4.S16
+#define dXi1                            D5.S16
+#define dYr0                            D6.S16
+#define dYi0                            D7.S16
+#define dYr1                            D8.S16
+#define dYi1                            D9.S16
+#define qT0                             Q5.S32
+#define qT1                             Q6.S32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// Last radix-2 stage on complex S16 data: each loop pass handles two groups, one per 16-bit lane.
+        @// scaled selects the halving add/sub forms; inverse multiplies by the conjugated twiddle.
+        MOV     outPointStep,subFFTSize,LSL #2
+        @// Update grpCount and grpSize rightaway
+
+        MOV     subFFTNum,#1                            @//after the last stage
+        LSL     grpCount,subFFTSize,#1
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        SUB      step,outPointStep,#4                   @// step = -4+outPointStep
+        RSB      dstStep,step,#0                        @// dstStep = -4-outPointStep+8 = -step
+        @//RSB      dstStep,outPointStep,#16
+
+
+        @// Loop on 2 grps at a time for the last stage
+
+grpLoop\name:
+        VLD2    {dWr[0],dWi[0]},[pTwiddle]!             @// grp 0
+        VLD2    {dWr[1],dWi[1]},[pTwiddle]!             @// grp 1
+
+        @//VLD2    {dWr,dWi},[pTwiddle],#16
+
+        VLD4    {dXr0[0],dXi0[0],dXr1[0],dXi1[0]},[pSrc]!   @// grp 0
+        VLD4    {dXr0[1],dXi0[1],dXr1[1],dXi1[1]},[pSrc]!   @// grp 1
+
+
+        @//VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc],#32
+        SUBS    grpCount,grpCount,#4                   @// grpCount holds 2*subFFTSize, so two groups cost 4
+
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr1,dWr
+            VMLAL   qT0,dXi1,dWi                       @// real part
+            VMULL   qT1,dXi1,dWr
+            VMLSL   qT1,dXr1,dWi                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dXr1,dWr
+            VMLSL   qT0,dXi1,dWi                       @// real part
+            VMULL   qT1,dXi1,dWr
+            VMLAL   qT1,dXr1,dWi                       @// imag part
+
+        .ENDIF
+        @// Narrow the 32-bit products back to S16 with a rounding shift by 15 (presumably Q15 twiddles - confirm)
+        VRSHRN  dXr1,qT0,#15
+        VRSHRN  dXi1,qT1,#15
+
+
+        .ifeqs "\scaled", "TRUE"
+
+            VHSUB    dYr0,dXr0,dXr1
+            VHSUB    dYi0,dXi0,dXi1
+            VHADD    dYr1,dXr0,dXr1
+            VHADD    dYi1,dXi0,dXi1
+
+        .ELSE
+
+            VSUB    dYr0,dXr0,dXr1
+            VSUB    dYi0,dXi0,dXi1
+            VADD    dYr1,dXr0,dXr1
+            VADD    dYi1,dXi0,dXi1
+
+
+        .ENDIF
+        @// Y0 for both groups goes to pDst, Y1 for both groups outPointStep bytes later
+        VST2    {dYr0[0],dYi0[0]},[pDst]!
+        VST2    {dYr0[1],dYi0[1]},[pDst],step               @// step = -4+outPointStep
+
+        VST2    {dYr1[0],dYi1[0]},[pDst]!
+        VST2    {dYr1[1],dYi1[1]},[pDst],dstStep            @// dstStep = -4-outPointStep+8 = -step
+
+        @//VST2    {dYr0,dYi0},[pDst],outPointStep
+        @//VST2    {dYr1,dYi1},[pDst],dstStep                  @// dstStep =  step = -outPointStep + 16
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage (pTmp reuses the grpCount register)
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst = pSrc - 2*outPointStep
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= outPointStep bytes
+
+        .endm
+
+@// Entry points: FFTSTAGE expanded four times with (scaled, inverse, label suffix).
+@// NOTE(review): M_START/M_END are defined outside this file; confirm what the r4 argument makes them preserve.
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+@// Inverse, unscaled: conjugated twiddle multiply
+
+        M_START armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+@// Forward, scaled: halving add/sub
+
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+@// Inverse, scaled
+
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+    .END
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
@@ -0,0 +1,216 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6740
+@// Last Modified Date:       Wed, 18 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Input registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@// Output registers: none are named; the macro rewrites the input registers in place
+
+
+@// Local scratch registers (grpCount and pTmp both name r4; pTmp is only written after the loop)
+
+#define outPointStep                    r3
+#define grpCount                        r4
+#define dstStep                         r5
+#define twStep                          r8
+#define pTmp                            r4
+
+@// NEON registers: dW1/dW2 have an S32 view for the all-lanes twiddle load and an S16 view for arithmetic
+@// NOTE(review): D8-D13 are callee-saved under AAPCS; confirm the caller or M_START preserves them
+#define dW1S32                          D0.S32
+#define dW2S32                          D1.S32
+#define dW1                             D0.S16
+#define dW2                             D1.S16
+
+#define dX0                             D2.S16
+#define dX1                             D3.S16
+#define dX2                             D4.S16
+#define dX3                             D5.S16
+#define dY0                             D6.S16
+#define dY1                             D7.S16
+#define dY2                             D8.S16
+#define dY3                             D9.S16
+#define qT0                             Q5.S32
+#define qT1                             Q6.S32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// Radix-2 stage on complex S16 data for the case of two sets per group:
+        @// each loop pass handles two groups without an inner set loop.
+        @// scaled selects the halving add/sub forms; inverse multiplies by the conjugated twiddle.
+
+        @// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
+
+
+        LSL     grpCount,subFFTSize,#1
+
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        @// outPointStep = grpCount * subFFTNum, formed from the low halfwords of both registers
+        @// twStep = 2 * subFFTNum (taken before subFFTNum is halved)
+        SMULBB  outPointStep,grpCount,subFFTNum
+        MOV     twStep,subFFTNum,LSL #1
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
+
+
+        RSB      dstStep,outPointStep,#8
+
+
+        @// Note: the input step is a fixed 8 bytes here, so no register is set aside for it
+        @// Loop on the groups: 2 groups at a time
+
+grpLoop\name:
+
+        VLD1     dW1S32[],[pTwiddle],twStep                @//[wi | wr] copied to both 32-bit lanes
+        VLD1     dW2S32[],[pTwiddle],twStep
+
+        @// Process the sets for each grp:  2 sets at a time (no set looping required)
+
+        VLD1    dX0,[pSrc]!            @// point0: of set0,set1 of grp0
+        VLD1    dX1,[pSrc]!            @// point1: of set0,set1 of grp0
+        VLD1    dX2,[pSrc]!            @// point0: of set0,set1 of grp1
+        VLD1    dX3,[pSrc]!            @// point1: of set0,set1 of grp1
+
+        SUBS    grpCount,grpCount,#4              @// decrement the loop counter
+        VUZP    dW1,dW2                           @// dW1 = even lanes (wr), dW2 = odd lanes (wi)
+        VUZP    dX1,dX3                           @// dX1 = even lanes, dX3 = odd lanes of both point1 vectors
+
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dX1,dW1
+            VMLAL   qT0,dX3,dW2                       @// real part
+            VMULL   qT1,dX3,dW1
+            VMLSL   qT1,dX1,dW2                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dX1,dW1
+            VMLSL   qT0,dX3,dW2                       @// real part
+            VMULL   qT1,dX3,dW1
+            VMLAL   qT1,dX1,dW2                       @// imag part
+
+        .ENDIF
+        @// Narrow the 32-bit products back to S16 with a rounding shift by 15
+        VRSHRN  dX1,qT0,#15
+        VRSHRN  dX3,qT1,#15
+
+        VZIP    dX1,dX3
+        @// After the zip dX1 and dX3 are back in the interleaved layout of dX0 and dX2
+
+        .ifeqs "\scaled", "TRUE"
+
+            VHSUB    dY0,dX0,dX1
+            VHADD    dY1,dX0,dX1
+            VHSUB    dY2,dX2,dX3
+            VHADD    dY3,dX2,dX3
+
+        .ELSE
+
+            VSUB    dY0,dX0,dX1
+            VADD    dY1,dX0,dX1
+            VSUB    dY2,dX2,dX3
+            VADD    dY3,dX2,dX3
+
+
+
+        .ENDIF
+
+        VST1    dY0,[pDst],outPointStep             @// point0: of set0,set1 of grp0
+        VST1    dY1,[pDst],dstStep                  @// dstStep = -outPointStep + 8
+        VST1    dY2,[pDst],outPointStep             @// point0: of set0,set1 of grp1
+        VST1    dY3,[pDst],dstStep                  @// point1: of set0,set1 of grp1
+
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage (pTmp reuses the grpCount register)
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst = pSrc - 2*outPointStep
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= outPointStep bytes
+
+        .endm
+
+@// Entry points: FFTSTAGE expanded four times with (scaled, inverse, label suffix).
+@// NOTE(review): M_START/M_END are defined outside this file; confirm what the r4 argument makes them preserve.
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+@// Inverse, unscaled: conjugated twiddle multiply
+
+        M_START armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+@// Forward, scaled: halving add/sub
+
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+@// Inverse, scaled
+
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+    .END
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
@@ -0,0 +1,219 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   5892
+@// Last Modified Date:       Thu, 07 Jun 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+    @// Guarding implementation by the processor name
+
+
+@// Input registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@// Output registers: none are named; the macro rewrites the input registers in place
+
+
+@// Local scratch registers
+@// NOTE(review): r8-r11 are used while the entry points pass r4 to M_START; confirm who preserves them
+#define outPointStep                    r3
+#define pointStep                       r4
+#define grpCount                        r5
+#define setCount                        r8
+#define step                            r10
+#define dstStep                         r11
+#define pTmp                            r9
+
+@// NEON registers. qT0 (Q3 = D6/D7) overlaps dY0/dY1 and qT1 (Q4 = D8/D9) overlaps dY2/dY3,
+@// so the products must be narrowed out of qT0/qT1 before any dY register is written.
+#define dW                              D0.S16
+#define dX0                             D2.S16
+#define dX1                             D3.S16
+#define dX2                             D4.S16
+#define dX3                             D5.S16
+#define dY0                             D6.S16
+#define dY1                             D7.S16
+#define dY2                             D8.S16
+#define dY3                             D9.S16
+#define qT0                             Q3.S32
+#define qT1                             Q4.S32
+
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// General radix-2 stage on complex S16 data: outer loop over groups, inner loop over sets (4 per pass).
+        @// scaled selects the halving add/sub forms; inverse multiplies by the conjugated twiddle.
+
+
+        @// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
+
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
+        LSL     grpCount,subFFTSize,#1
+
+
+        @// pointStep is first set to 2*grpSize for the multiply below,
+        @// then doubled to 4*grpSize, the byte distance between the two inputs of a butterfly
+        MOV     pointStep,subFFTNum,LSL #1
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        @// outPointStep = grpCount * pointStep, formed from the low halfwords of both registers;
+        @// it is used as the byte distance between the two outputs of a butterfly
+        SMULBB  outPointStep,grpCount,pointStep
+        LSL     pointStep,pointStep,#1
+
+
+        RSB      step,pointStep,#16
+        RSB      dstStep,outPointStep,#16
+
+        @// Loop on the groups
+
+grpLoop\name:
+
+        VLD1     dW,[pTwiddle],pointStep                @//[wi | wr] only lanes 0 and 1 are used below
+        MOV      setCount,pointStep,LSR #2
+
+
+        @// Loop on the sets: 4 at a time
+
+
+setLoop\name:
+
+
+        VLD2    {dX0,dX1},[pSrc],pointStep            @// point0: dX0-real part dX1-img part
+        VLD2    {dX2,dX3},[pSrc],step                 @// point1: dX2-real part dX3-img part
+
+        SUBS    setCount,setCount,#4
+
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dX2,dW[0]
+            VMLAL   qT0,dX3,dW[1]                       @// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLSL   qT1,dX2,dW[1]                       @// imag part
+
+        .ELSE
+
+            VMULL   qT0,dX2,dW[0]
+            VMLSL   qT0,dX3,dW[1]                       @// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLAL   qT1,dX2,dW[1]                       @// imag part
+
+        .ENDIF
+        @// Narrow both products before the dY registers, which overlap qT0/qT1, are written
+        VRSHRN  dX2,qT0,#15
+        VRSHRN  dX3,qT1,#15
+
+        .ifeqs "\scaled", "TRUE"
+            VHSUB    dY0,dX0,dX2
+            VHSUB    dY1,dX1,dX3
+            VHADD    dY2,dX0,dX2
+            VHADD    dY3,dX1,dX3
+
+        .ELSE
+            VSUB    dY0,dX0,dX2
+            VSUB    dY1,dX1,dX3
+            VADD    dY2,dX0,dX2
+            VADD    dY3,dX1,dX3
+
+        .ENDIF
+
+        VST2    {dY0,dY1},[pDst],outPointStep
+        VST2    {dY2,dY3},[pDst],dstStep              @// dstStep = -outPointStep + 16
+
+        BGT     setLoop\name
+
+        SUBS    grpCount,grpCount,#2
+        ADD     pSrc,pSrc,pointStep                   @// skip the point1 half already consumed; flags untouched
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst = pSrc - 2*outPointStep
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= outPointStep bytes
+
+
+        .endm
+
+@// Entry points: FFTSTAGE expanded four times with (scaled, inverse, label suffix).
+@// NOTE(review): M_START/M_END are defined outside this file; confirm what the r4 argument makes them preserve.
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+@// Inverse, unscaled: conjugated twiddle multiply
+
+        M_START armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+@// Forward, scaled: halving add/sub
+
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+@// Inverse, scaled
+
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+    .END
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
@@ -0,0 +1,314 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7761
+@// Last Modified Date:       Wed, 26 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute the first-stage Radix 4 FFT butterflies for an N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Input registers
+@// (in the FFTSTAGE macro below, pTwiddle is never referenced and subFFTSize is only written)
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define pPingPongBuf                    r5
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@// Output registers
+@// (none declared; FFTSTAGE leaves updated values in pSrc, pDst, subFFTNum and subFFTSize)
+
+@// Local scratch registers
+@// Note: the two point-step names defined below both map to r4
+#define grpSize                         r3
+@// Reuse grpSize as setCount
+#define setCount                        r3
+#define pointStep                       r4
+#define outPointStep                    r4
+#define setStep                         r8
+#define step1                           r9
+#define step3                           r10
+
+@// Neon registers
+@// dX* = loaded inputs, dY* = first butterfly level, dZ* = results; each q name overlays a real/imag d pair
+#define dXr0                            D0.S16
+#define dXi0                            D1.S16
+#define dXr1                            D2.S16
+#define dXi1                            D3.S16
+#define dXr2                            D4.S16
+#define dXi2                            D5.S16
+#define dXr3                            D6.S16
+#define dXi3                            D7.S16
+#define dYr0                            D8.S16
+#define dYi0                            D9.S16
+#define dYr1                            D10.S16
+#define dYi1                            D11.S16
+#define dYr2                            D12.S16
+#define dYi2                            D13.S16
+#define dYr3                            D14.S16
+#define dYi3                            D15.S16
+#define dZr0                            D16.S16
+#define dZi0                            D17.S16
+#define dZr1                            D18.S16
+#define dZi1                            D19.S16
+#define dZr2                            D20.S16
+#define dZi2                            D21.S16
+#define dZr3                            D22.S16
+#define dZi3                            D23.S16
+#define qY0                             Q4.S16
+#define qY2                             Q6.S16
+#define qX0                             Q0.S16
+#define qX2                             Q2.S16
+
+#define qY1                             Q5.S16
+#define qY3                             Q7.S16
+#define qX1                             Q1.S16
+#define qX3                             Q3.S16
+#define qZ0                             Q8.S16
+#define qZ1                             Q9.S16
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// First radix-4 stage on SC16 data: add/sub butterflies only, no twiddle multiplies.
+        @//   scaled  = "TRUE": halving VHADD/VHSUB butterflies; otherwise plain VADD/VSUB
+        @//   inverse = "TRUE": u1+j*u3 is stored second and u1-j*u3 fourth (reverse of forward); name = label suffix
+        MOV     pointStep,subFFTNum                   @// byte stride between the four butterfly inputs
+        @// Update pSubFFTSize and pSubFFTNum regs
+
+
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        @// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
+        MOV     subFFTNum,grpSize                     @// subFFTNum = subFFTNum/4
+
+
+        @// pT0+1 increments pT0 by 4 bytes
+        @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+        @// Note: outPointStep = pointStep for firststage
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     setStep,pointStep,LSL #1
+        MOV     setStep,grpSize,LSL #3                @// 8*(subFFTNum/4), i.e. 2*pointStep
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        MOV     step1,setStep                         @// step1 = 2*pointStep
+        ADD     setStep,setStep,pointStep             @// setStep = 3*pointStep
+        RSB     setStep,setStep,#16                   @// setStep = - 3*pointStep+16
+
+
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3]
+        MOV     subFFTSize,#4                         @// subFFTSize = 4 once this first stage is done
+
+
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    qY0,qX0,qX2             @// u0
+        .ELSE
+            VADD   qY0,qX0,qX2               @// u0
+        .ENDIF
+        RSB     step3,pointStep,#0                    @// step3 = -pointStep
+
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets: 4 sets at a time
+
+grpZeroSetLoop\name:
+        @// At the top of each pass X0..X3 hold the current set and qY0 already holds u0 = X0+X2.
+        @// The loads inside the pass fetch the next set in the order data[0], data[2], data[1], data[3].
+        .ifeqs "\scaled", "TRUE"
+
+            @// finish first stage of 4 point FFT
+
+            VHSUB    qY2,qX0,qX2             @// u1
+            SUBS    setCount,setCount,#4                    @// decrement the set loop counter
+
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VHADD    qY1,qX1,qX3             @// u2
+            VLD2    {dXr2,dXi2},[pSrc :128],step3          @//  data[2]
+            VHSUB    qY3,qX1,qX3             @// u3
+
+
+
+            @// finish second stage of 4 point FFT
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VHADD    qZ0,qY0,qY1             @// y0
+
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep        @//  data[3], then step to the following set
+
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VHSUB    dZr3,dYr2,dYi3                  @// y3
+                VHADD    dZi3,dYi2,dYr3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+
+                VHSUB    qZ1,qY0,qY1                     @// y2
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VHADD    dZr2,dYr2,dYi3                  @// y1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHSUB    dZi2,dYi2,dYr3
+
+                VHADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+
+
+            .ELSE
+
+                VHADD    dZr2,dYr2,dYi3                  @// y1
+                VHSUB    dZi2,dYi2,dYr3
+
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    qZ1,qY0,qY1                     @// y2
+
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VHSUB    dZr3,dYr2,dYi3                  @// y3
+                VHADD    dZi3,dYi2,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+
+            .ENDIF
+
+
+        .ELSE
+
+            @// finish first stage of 4 point FFT
+
+            VSUB    qY2,qX0,qX2             @// u1
+            SUBS    setCount,setCount,#4                    @// decrement the set loop counter
+
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VADD    qY1,qX1,qX3             @// u2
+            VLD2    {dXr2,dXi2},[pSrc :128],step3          @//  data[2]
+            VSUB    qY3,qX1,qX3             @// u3
+
+
+
+            @// finish second stage of 4 point FFT
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VADD    qZ0,qY0,qY1             @// y0
+
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep        @//  data[3], then step to the following set
+
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VSUB    dZr3,dYr2,dYi3                  @// y3
+                VADD    dZi3,dYi2,dYr3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+
+                VSUB    qZ1,qY0,qY1                     @// y2
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VADD    dZr2,dYr2,dYi3                  @// y1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VSUB    dZi2,dYi2,dYr3
+
+                VADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+
+
+            .ELSE
+
+                VADD    dZr2,dYr2,dYi3                  @// y1
+                VSUB    dZi2,dYi2,dYr3
+
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    qZ1,qY0,qY1                     @// y2
+
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VSUB    dZr3,dYr2,dYi3                  @// y3
+                VADD    dZi3,dYi2,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+
+            .ENDIF
+
+
+        .ENDIF
+
+        BGT     grpZeroSetLoop\name                     @// flags come from the SUBS on setCount above
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep
+        MOV     pDst,pPingPongBuf
+
+
+        .endm
+
+
+        @// First-stage radix-4, forward, unscaled: scaled="FALSE", inverse="FALSE", labels suffixed FWD.
+        M_START armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        @// First-stage radix-4, inverse, unscaled: scaled="FALSE", inverse="TRUE", labels suffixed INV.
+        M_START armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+        @// First-stage radix-4, forward, scaled (halving butterflies): scaled="TRUE", inverse="FALSE".
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+        @// First-stage radix-4, inverse, scaled (halving butterflies): scaled="TRUE", inverse="TRUE".
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+    .END
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
@@ -0,0 +1,410 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7765
+@// Last Modified Date:       Thu, 27 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute the last Radix 4 FFT stage for an N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+    @//IMPORT  armAAC_constTable
+
+@// Input registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+
+@// Output registers
+
+
+@// Local scratch registers
+@// Note: the temporary pointer name defined last in this group shares r4 with the group counter
+#define outPointStep                    r3
+#define grpCount                        r4
+#define dstStep                         r5
+#define pw1                             r8
+#define pw2                             r9
+#define pw3                             r10
+#define pTmp                            r4
+
+
+@// Neon registers
+@// The Butterfly* names and the X* names below are two views of D0-D7 (as loaded by VLD4 / after VUZP)
+#define dButterfly1Real02               D0.S16
+#define dButterfly1Imag02               D1.S16
+#define dButterfly1Real13               D2.S16
+#define dButterfly1Imag13               D3.S16
+#define dButterfly2Real02               D4.S16
+#define dButterfly2Imag02               D5.S16
+#define dButterfly2Real13               D6.S16
+#define dButterfly2Imag13               D7.S16
+#define dXr0                            D0.S16
+#define dXi0                            D1.S16
+#define dXr1                            D2.S16
+#define dXi1                            D3.S16
+#define dXr2                            D4.S16
+#define dXi2                            D5.S16
+#define dXr3                            D6.S16
+#define dXi3                            D7.S16
+@// Twiddle registers, 32-bit element view (used by the VLD3 loads in FFTSTAGE)
+#define dW1rS32                         D8.S32
+#define dW1iS32                         D9.S32
+#define dW2rS32                         D10.S32
+#define dW2iS32                         D11.S32
+#define dW3rS32                         D12.S32
+#define dW3iS32                         D13.S32
+@// The same twiddle registers, 16-bit element view (used by the multiplies)
+#define dW1r                            D8.S16
+#define dW1i                            D9.S16
+#define dW2r                            D10.S16
+#define dW2i                            D11.S16
+#define dW3r                            D12.S16
+#define dW3i                            D13.S16
+@// Load temporaries: D12/D13 are also the third twiddle, D14/D15 are also the Y3 pair defined below
+#define dTmp0                           D12.S16
+#define dTmp1                           D13.S16
+#define dTmp1S32                        D13.S32
+#define dTmp2S32                        D14.S32
+#define dTmp3S32                        D15.S32
+@// First butterfly level; note the register order is not sequential (Y0=Q9, Y1=Q8, Y2=Q10, Y3=Q7)
+#define dYr0                            D18.S16
+#define dYi0                            D19.S16
+#define dYr1                            D16.S16
+#define dYi1                            D17.S16
+#define dYr2                            D20.S16
+#define dYi2                            D21.S16
+#define dYr3                            D14.S16
+#define dYi3                            D15.S16
+#define qY0                             Q9.S16
+#define qY1                             Q8.S16
+#define qY2                             Q10.S16
+#define qY3                             Q7.S16
+
+#define qX0                             Q0.S16
+#define qX1                             Q1.S16
+#define qX2                             Q2.S16
+#define qX3                             Q3.S16
+@// 32-bit product accumulators; they occupy the same Q registers as the Y0, Y2, Y3, Y1 pairs above
+#define qT0                             Q9.S32
+#define qT1                             Q10.S32
+#define qT2                             Q7.S32
+#define qT3                             Q8.S32
+@// Z registers: receive the narrowed twiddle products first, then the final results
+#define dZr0                            D22.S16
+#define dZi0                            D23.S16
+#define dZr1                            D24.S16
+#define dZi1                            D25.S16
+#define dZr2                            D26.S16
+#define dZi2                            D27.S16
+#define dZr3                            D28.S16
+#define dZi3                            D29.S16
+
+#define qZ0                             Q11.S16
+#define qZ1                             Q12.S16
+#define qZ2                             Q13.S16
+#define qZ3                             Q14.S16
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+        @// Last radix-4 stage on SC16 data; each pass of grpLoop handles 4 groups (16 complex inputs).
+        @//   scaled = "TRUE": halving VHADD/VHSUB butterflies; inverse = "TRUE": VMLAL/VMLSL roles swapped
+        @//   in the twiddle multiply and outputs 1 and 3 stored in swapped order; name = label suffix
+        MOV     pw2,pTwiddle
+        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!            @// w^2: dTmp0/dTmp1 are overwritten by the loads below
+
+        MOV     pw3,pTwiddle
+        MOV     pw1,pTwiddle
+        @// pOut0+1 increments pOut0 by 4 bytes (one 16-bit real + one 16-bit imaginary part)
+        @// pOut0+outPointStep == increment of 4*outPointStep bytes
+        MOV     outPointStep,subFFTSize,LSL #2
+
+        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
+        MOV     subFFTNum,#1                            @//after the last stage
+        LSL     grpCount,subFFTSize,#2
+
+
+        @// Update grpCount and grpSize rightaway
+        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        MOV     dstStep,outPointStep,LSL #1
+
+        VLD2 {dW1r,dW1i}, [pw1 :128]!
+
+
+        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
+
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+        @// Process 4 groups at a time
+
+grpLoop\name:
+        @// At the top of each pass the data and all three twiddle vectors for these 4 groups are already loaded.
+
+        @// Rearrange the third twiddle
+        VUZP    dW3r,dW3i
+        SUBS    grpCount,grpCount,#16                    @// grpCount is multiplied by 4
+
+
+        VUZP     dButterfly1Real13, dButterfly2Real13        @// B.r D.r
+        VUZP     dButterfly1Imag13, dButterfly2Imag13        @// B.i D.i
+        VUZP     dButterfly1Real02, dButterfly2Real02        @// A.r C.r
+        VUZP     dButterfly1Imag02, dButterfly2Imag02        @// A.i C.i
+
+
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr1,dW1r
+            VMLAL   qT0,dXi1,dW1i                       @// real part
+            VMULL   qT1,dXi1,dW1r
+            VMLSL   qT1,dXr1,dW1i                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dXr1,dW1r
+            VMLSL   qT0,dXi1,dW1i                       @// real part
+            VMULL   qT1,dXi1,dW1r
+            VMLAL   qT1,dXr1,dW1i                       @// imag part
+
+        .ENDIF
+
+        @// Load the first twiddle for 4 groups : w^1
+        @// w^1 twiddle (i+0,i+1,i+2,i+3)       for group 0,1,2,3
+
+        VLD2 {dW1r,dW1i}, [pw1 :128]!
+
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT2,dXr2,dW2r
+            VMLAL   qT2,dXi2,dW2i                       @// real part
+            VMULL   qT3,dXi2,dW2r
+            VMLSL   qT3,dXr2,dW2i                       @// imag part
+
+        .ELSE
+            VMULL   qT2,dXr2,dW2r
+            VMLSL   qT2,dXi2,dW2i                       @// real part
+            VMULL   qT3,dXi2,dW2r
+            VMLAL   qT3,dXr2,dW2i                       @// imag part
+
+        .ENDIF
+
+        VRSHRN  dZr1,qT0,#15
+        VRSHRN  dZi1,qT1,#15
+
+
+
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr3,dW3r
+            VMLAL   qT0,dXi3,dW3i                       @// real part
+            VMULL   qT1,dXi3,dW3r
+            VMLSL   qT1,dXr3,dW3i                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dXr3,dW3r
+            VMLSL   qT0,dXi3,dW3i                       @// real part
+            VMULL   qT1,dXi3,dW3r
+            VMLAL   qT1,dXr3,dW3i                       @// imag part
+
+        .ENDIF
+
+        @// Load the second twiddle for 4 groups : w^2
+        @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6)   for group 0,1,2,3
+        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
+        @// (this load also overwrites dW3r/dW3i via dTmp0/dTmp1, so it must follow the w^3 multiplies above)
+
+        VRSHRN  dZr2,qT2,#15
+        VRSHRN  dZi2,qT3,#15
+
+        @// Load the third twiddle for 4 groups : w^3
+        @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9)   for group 0,1,2,3
+        @// (these loads write D14/D15, i.e. qT2 and qY3: after qT2 is narrowed above, before qY3 is formed below)
+        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
+
+        VRSHRN  dZr3,qT0,#15
+        VRSHRN  dZi3,qT1,#15
+
+        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
+
+        .ifeqs "\scaled", "TRUE"
+
+            @// finish first stage of 4 point FFT
+
+            VHADD    qY0,qX0,qZ2
+            VHSUB    qY2,qX0,qZ2
+            VHADD    qY1,qZ1,qZ3
+            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+            VHSUB    qY3,qZ1,qZ3
+
+            @// finish second stage of 4 point FFT
+
+            VHSUB    qZ0,qY2,qY1
+            VHADD    qZ2,qY2,qY1
+            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+            @// (the two VLD4 above fetch the next 4 groups on every pass, including the final one)
+
+            .ifeqs "\inverse", "TRUE"
+
+                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+
+                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VHADD    dZi1,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ELSE
+
+                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VHADD    dZi1,dYi0,dYr3
+
+                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ENDIF
+
+        .ELSE
+
+            @// finish first stage of 4 point FFT
+
+            VADD    qY0,qX0,qZ2
+            VSUB    qY2,qX0,qZ2
+            VADD    qY1,qZ1,qZ3
+            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+            VSUB    qY3,qZ1,qZ3
+
+            @// finish second stage of 4 point FFT
+
+            VSUB    qZ0,qY2,qY1
+            VADD    qZ2,qY2,qY1
+            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+            @// (the two VLD4 above fetch the next 4 groups on every pass, including the final one)
+
+            .ifeqs "\inverse", "TRUE"
+
+                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+
+                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VADD    dZi1,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ELSE
+
+                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VADD    dZi1,dYi0,dYr3
+
+                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ENDIF
+
+
+
+
+        .ENDIF
+
+        BGT     grpLoop\name                        @// flags come from the SUBS on grpCount at the loop top
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst                           @// pTmp reuses r4 (grpCount), which is no longer needed
+        SUB     pSrc,pSrc,#64                       @// Extra increment currently done in the loop
+        SUB     pDst,pSrc,outPointStep,LSL #2       @// pDst = pSrc - 4*outPointStep
+        SUB     pSrc,pTmp,outPointStep              @// pSrc = advanced pDst (saved in pTmp) - outPointStep
+
+        .endm
+
+        @// Last-stage radix-4, forward, unscaled: scaled="FALSE", inverse="FALSE", labels suffixed FWD.
+        M_START armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+        @// Last-stage radix-4, inverse, unscaled: scaled="FALSE", inverse="TRUE", labels suffixed INV.
+        M_START armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+        @// Last-stage radix-4, forward, scaled (halving butterflies): scaled="TRUE", inverse="FALSE".
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+        @// Last-stage radix-4, inverse, scaled (halving butterflies): scaled="TRUE", inverse="TRUE".
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+
+    .END
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
@@ -0,0 +1,400 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7761
+@// Last Modified Date:       Wed, 26 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for an N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+
+    @// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Input registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+
+@// Output registers
+
+
+@// Local scratch registers
+@// Note: the last name in this group is a second alias for r3, the group counter
+#define grpCount                        r3
+#define pointStep                       r4
+#define outPointStep                    r5
+#define stepTwiddle                     r12
+#define setCount                        r14
+#define srcStep                         r8
+#define setStep                         r9
+#define dstStep                         r10
+#define twStep                          r11
+#define t1                              r3
+
+@// Neon registers
+@// The three W registers are each filled by a VLD1 from pTwiddle and used by lane ([0], [1]) in FFTSTAGE
+#define dW1                             D0.S16
+#define dW2                             D1.S16
+#define dW3                             D2.S16
+
+#define dXr0                            D4.S16
+#define dXi0                            D5.S16
+#define dXr1                            D6.S16
+#define dXi1                            D7.S16
+#define dXr2                            D8.S16
+#define dXi2                            D9.S16
+#define dXr3                            D10.S16
+#define dXi3                            D11.S16
+#define dYr0                            D12.S16
+#define dYi0                            D13.S16
+#define dYr1                            D14.S16
+#define dYi1                            D15.S16
+#define dYr2                            D16.S16
+#define dYi2                            D17.S16
+#define dYr3                            D18.S16
+#define dYi3                            D19.S16
+#define qT0                             Q8.S32
+#define qT1                             Q9.S32
+#define qT2                             Q6.S32
+#define qT3                             Q7.S32
+@// Note: the four 32-bit T accumulators above occupy Q8, Q9, Q6, Q7, the same registers as the Y pairs
+#define dZr0                            D20.S16
+#define dZi0                            D21.S16
+#define dZr1                            D22.S16
+#define dZi1                            D23.S16
+#define dZr2                            D24.S16
+#define dZi2                            D25.S16
+#define dZr3                            D26.S16
+#define dZi3                            D27.S16
+#define qY0                             Q6.S16
+#define qY1                             Q7.S16
+#define qY2                             Q8.S16
+#define qY3                             Q9.S16
+#define qX0                             Q2.S16
+#define qZ0                             Q10.S16
+#define qZ1                             Q11.S16
+#define qZ2                             Q12.S16
+#define qZ3                             Q13.S16
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        @// Define stack arguments
+
+
+        @// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
+
+        LSL     grpCount,subFFTSize,#2
+        LSR     subFFTNum,subFFTNum,#2
+        MOV     subFFTSize,grpCount
+
+
+        @// pOut0+1 increments pOut0 by 4 bytes
+        @// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes
+
+        MOV     stepTwiddle,#0
+        SMULBB  outPointStep,grpCount,subFFTNum
+
+        @// pT0+1 increments pT0 by 4 bytes
+        @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+
+        LSL     pointStep,subFFTNum,#2                      @// 2*grpSize
+
+        VLD1     dW1,[pTwiddle :64]                             @//[wi | wr]
+        MOV     srcStep,pointStep,LSL #1                    @// srcStep = 2*pointStep
+        VLD1     dW2,[pTwiddle :64]                             @//[wi | wr]
+        ADD     setStep,srcStep,pointStep                   @// setStep = 3*pointStep
+        SUB     srcStep,srcStep,#16                         @// srcStep = 2*pointStep-16
+        VLD1     dW3,[pTwiddle :64]
+        @//RSB     setStep,setStep,#16                      @// setStep = - 3*pointStep+16
+        RSB     setStep,setStep,#0                          @// setStep = - 3*pointStep
+
+        MOV     dstStep,outPointStep,LSL #1
+        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
+
+
+
+grpLoop\name:
+
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+        ADD      pTwiddle,pTwiddle,stepTwiddle               @// set pTwiddle to the first point
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        MOV      twStep,stepTwiddle,LSL #2
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & reset pSrc
+
+        SUB      twStep,stepTwiddle,twStep                   @// twStep = -3*stepTwiddle
+
+
+        MOV      setCount,pointStep,LSR #2
+        ADD     pSrc,pSrc,#16                         @// set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,pointStep                   @// increment to data[1] of the next set
+
+        @// Loop on the sets : 4 at a time
+
+setLoop\name:
+
+        SUBS    setCount,setCount,#4                    @// decrement the loop counter
+
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr1,dW1[0]
+            VMLAL   qT0,dXi1,dW1[1]                       @// real part
+            VMULL   qT1,dXi1,dW1[0]
+            VMLSL   qT1,dXr1,dW1[1]                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dXr1,dW1[0]
+            VMLSL   qT0,dXi1,dW1[1]                       @// real part
+            VMULL   qT1,dXi1,dW1[0]
+            VMLAL   qT1,dXr1,dW1[1]                       @// imag part
+
+        .ENDIF
+
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT2,dXr2,dW2[0]
+            VMLAL   qT2,dXi2,dW2[1]                       @// real part
+            VMULL   qT3,dXi2,dW2[0]
+            VMLSL   qT3,dXr2,dW2[1]                       @// imag part
+
+        .ELSE
+            VMULL   qT2,dXr2,dW2[0]
+            VMLSL   qT2,dXi2,dW2[1]                       @// real part
+            VMULL   qT3,dXi2,dW2[0]
+            VMLAL   qT3,dXr2,dW2[1]                       @// imag part
+
+        .ENDIF
+
+        VRSHRN  dZr1,qT0,#15
+        VRSHRN  dZi1,qT1,#15
+
+
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr3,dW3[0]
+            VMLAL   qT0,dXi3,dW3[1]                       @// real part
+            VMULL   qT1,dXi3,dW3[0]
+            VMLSL   qT1,dXr3,dW3[1]                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dXr3,dW3[0]
+            VMLSL   qT0,dXi3,dW3[1]                       @// real part
+            VMULL   qT1,dXi3,dW3[0]
+            VMLAL   qT1,dXr3,dW3[1]                       @// imag part
+
+        .ENDIF
+
+        VRSHRN  dZr2,qT2,#15
+        VRSHRN  dZi2,qT3,#15
+
+
+        VRSHRN  dZr3,qT0,#15
+        VRSHRN  dZi3,qT1,#15
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
+
+
+        .ifeqs "\scaled", "TRUE"
+
+            @// finish first stage of 4 point FFT
+            VHADD    qY0,qX0,qZ2
+            VHSUB    qY2,qX0,qZ2
+
+            VLD2    {dXr0,dXi0},[pSrc :128]!          @//  data[0]
+            VHADD    qY1,qZ1,qZ3
+            VHSUB    qY3,qZ1,qZ3
+
+
+            @// finish second stage of 4 point FFT
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VHSUB    qZ0,qY2,qY1
+
+                VHADD    dZr2,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi2,dYi0,dYr3
+
+                VHADD    qZ1,qY2,qY1
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+
+                VHSUB    dZr3,dYr0,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHADD    dZi3,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+            .ELSE
+
+                VHSUB    qZ0,qY2,qY1
+
+                VHSUB    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHADD    dZi3,dYi0,dYr3
+
+                VHADD    qZ1,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VHADD    dZr2,dYr0,dYi3
+                VHSUB    dZi2,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],dstStep
+
+
+            .ENDIF
+
+
+        .ELSE
+
+            @// finish first stage of 4 point FFT
+            VADD    qY0,qX0,qZ2
+            VSUB    qY2,qX0,qZ2
+
+            VLD2    {dXr0,dXi0},[pSrc]!          @//  data[0]
+            VADD    qY1,qZ1,qZ3
+            VSUB    qY3,qZ1,qZ3
+
+
+            @// finish second stage of 4 point FFT
+
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VSUB    qZ0,qY2,qY1
+
+                VADD    dZr2,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi2,dYi0,dYr3
+
+                VADD    qZ1,qY2,qY1
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+
+                VSUB    dZr3,dYr0,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VADD    dZi3,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+            .ELSE
+
+                VSUB    qZ0,qY2,qY1
+
+                VSUB    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VADD    dZi3,dYi0,dYr3
+
+                VADD    qZ1,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VADD    dZr2,dYr0,dYi3
+                VSUB    dZi2,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],dstStep
+
+
+            .ENDIF
+
+
+
+        .ENDIF
+
+        ADD     pSrc,pSrc,pointStep                         @// increment to data[1] of the next set
+        BGT     setLoop\name
+
+        VLD1     dW1,[pTwiddle :64],stepTwiddle                 @//[wi | wr]
+        SUBS    grpCount,grpCount,#4                        @// subtract 4 since grpCount multiplied by 4
+        VLD1     dW2,[pTwiddle :64],stepTwiddle                 @//[wi | wr]
+        ADD     pSrc,pSrc,srcStep                           @// increment pSrc for the next grp
+        VLD1     dW3,[pTwiddle :64],twStep                      @//[wi | wr]
+
+
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     t1,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #2           @// pDst -= size; pSrc -= 4*size bytes
+        SUB     pSrc,t1,outPointStep
+
+
+        .endm
+
+        @// Forward radix-4 stage, unscaled: FFTSTAGE with scaled="FALSE", inverse="FALSE"; labels get the FWD suffix
+        M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+        @// Inverse radix-4 stage, unscaled: FFTSTAGE with scaled="FALSE", inverse="TRUE"; labels get the INV suffix
+        M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+        @// Forward radix-4 stage, scaled (halving adds): FFTSTAGE with scaled="TRUE", inverse="FALSE"; labels get the FWDSFS suffix
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+        @// Inverse radix-4 stage, scaled (halving adds): FFTSTAGE with scaled="TRUE", inverse="TRUE"; labels get the INVSFS suffix
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+    .END
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
@@ -0,0 +1,619 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7766
+@// Last Modified Date:       Thu, 27 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+@// dest buffer for the next stage (not pSrc for first stage)
+#define pPingPongBuf                    r5
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize                         r3
+@// Reuse grpSize as setCount
+#define setCount                        r3
+#define pointStep                       r4
+#define outPointStep                    r4
+#define setStep                         r8
+#define step1                           r9
+#define step2                           r10
+#define t0                              r11
+
+
+@// Neon Registers
+
+#define dXr0                            D14.S16
+#define dXi0                            D15.S16
+#define dXr1                            D2.S16
+#define dXi1                            D3.S16
+#define dXr2                            D4.S16
+#define dXi2                            D5.S16
+#define dXr3                            D6.S16
+#define dXi3                            D7.S16
+#define dXr4                            D8.S16
+#define dXi4                            D9.S16
+#define dXr5                            D10.S16
+#define dXi5                            D11.S16
+#define dXr6                            D12.S16
+#define dXi6                            D13.S16
+#define dXr7                            D0.S16
+#define dXi7                            D1.S16
+#define qX0                             Q7.S16
+#define qX1                             Q1.S16
+#define qX2                             Q2.S16
+#define qX3                             Q3.S16
+#define qX4                             Q4.S16
+#define qX5                             Q5.S16
+#define qX6                             Q6.S16
+#define qX7                             Q0.S16
+
+#define dUr0                            D16.S16
+#define dUi0                            D17.S16
+#define dUr2                            D18.S16
+#define dUi2                            D19.S16
+#define dUr4                            D20.S16
+#define dUi4                            D21.S16
+#define dUr6                            D22.S16
+#define dUi6                            D23.S16
+#define dUr1                            D24.S16
+#define dUi1                            D25.S16
+#define dUr3                            D26.S16
+#define dUi3                            D27.S16
+#define dUr5                            D28.S16
+#define dUi5                            D29.S16
+@// D30/D31 are shared with dVr6/dVi6 and dYr7/dYi7 (dXr7/dXi7 live in D0/D1)
+#define dUr7                            D30.S16
+#define dUi7                            D31.S16
+#define qU0                             Q8.S16
+#define qU1                             Q12.S16
+#define qU2                             Q9.S16
+#define qU3                             Q13.S16
+#define qU4                             Q10.S16
+#define qU5                             Q14.S16
+#define qU6                             Q11.S16
+#define qU7                             Q15.S16
+
+
+
+#define dVr0                            D24.S16
+#define dVi0                            D25.S16
+#define dVr2                            D26.S16
+#define dVi2                            D27.S16
+#define dVr4                            D28.S16
+#define dVi4                            D29.S16
+#define dVr6                            D30.S16
+#define dVi6                            D31.S16
+#define dVr1                            D16.S16
+#define dVi1                            D17.S16
+#define dVr3                            D18.S16
+#define dVi3                            D19.S16
+#define dVr5                            D20.S16
+#define dVi5                            D21.S16
+@// shares D22 with dUr6 and dYr6
+#define dVr7                            D22.S16
+@// shares D23 with dUi6 and dYi6
+#define dVi7                            D23.S16
+#define qV0                             Q12.S16
+#define qV1                             Q8.S16
+#define qV2                             Q13.S16
+#define qV3                             Q9.S16
+#define qV4                             Q14.S16
+#define qV5                             Q10.S16
+#define qV6                             Q15.S16
+#define qV7                             Q11.S16
+
+
+
+#define dYr0                            D16.S16
+#define dYi0                            D17.S16
+#define dYr2                            D18.S16
+#define dYi2                            D19.S16
+#define dYr4                            D20.S16
+#define dYi4                            D21.S16
+#define dYr6                            D22.S16
+#define dYi6                            D23.S16
+#define dYr1                            D24.S16
+#define dYi1                            D25.S16
+#define dYr3                            D26.S16
+#define dYi3                            D27.S16
+#define dYr5                            D28.S16
+#define dYi5                            D29.S16
+@// D30/D31 are shared with dUr7/dUi7 and dVr6/dVi6
+#define dYr7                            D30.S16
+#define dYi7                            D31.S16
+#define qY0                             Q8.S16
+#define qY1                             Q12.S16
+#define qY2                             Q9.S16
+#define qY3                             Q13.S16
+#define qY4                             Q10.S16
+#define qY5                             Q14.S16
+#define qY6                             Q11.S16
+#define qY7                             Q15.S16
+
+@// dT0/dT1 alias D0/D1, the same registers as dXr7/dXi7 (qX7)
+#define dT0                             D0.S16
+#define dT1                             D1.S16
+
+
+@// Define constants
+        .set   ONEBYSQRT2, 0x00005A82        @// Q15 format
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+        @// First stage, radix 8: 8-point butterflies with unit twiddles, four sets per loop pass.
+        @// Define stack arguments
+        @// \scaled="TRUE" uses halving VHADD/VHSUB; \inverse="TRUE" selects the inverse signs; \name makes the loop label unique.
+        @// Update pSubFFTSize and pSubFFTNum regs
+        MOV     subFFTSize,#8                               @// subFFTSize = 8 after this radix-8 first stage
+        LDR     t0,=ONEBYSQRT2                              @// t0=(1/sqrt(2)) as Q15 format
+
+        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3
+        MOV     subFFTNum,grpSize
+
+
+        @// pointStep is a byte stride: grpSize complex samples of 4 bytes each
+        @// (SC16: 16-bit real + 16-bit imaginary), i.e. pointStep = 4*grpSize
+        @// Note: outPointStep = pointStep for firststage
+
+        MOV     pointStep,grpSize,LSL #2
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     step1,pointStep,LSL #1                      @// step1 = 2*pointStep
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        MOV     step1,grpSize,LSL #3                        @// step1 = 8*grpSize = 2*pointStep
+
+        MOV     step2,pointStep,LSL #3
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+        SUB     step2,step2,pointStep                          @// step2 = 7*pointStep
+        RSB     setStep,step2,#16                              @// setStep = - 7*pointStep+16
+
+
+
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
+        VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+        VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+        VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+        VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7] & update pSrc for the next set
+                                                      @//  setStep = -7*pointStep + 16
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets : 4 sets at a time
+
+grpZeroSetLoop\name:
+
+        @// Decrement setcount
+        SUBS    setCount,setCount,#4                    @// decrement the set loop counter
+
+
+        .ifeqs "\scaled", "TRUE"
+            @// finish first stage of 8 point FFT
+
+            VHADD    qU0,qX0,qX4
+            VHADD    qU2,qX1,qX5
+            VHADD    qU4,qX2,qX6
+            VHADD    qU6,qX3,qX7
+
+            @// finish second stage of 8 point FFT
+
+            VHADD    qV0,qU0,qU4
+            VHSUB    qV2,qU0,qU4
+            VHADD    qV4,qU2,qU6
+            VHSUB    qV6,qU2,qU6
+
+            @// finish third stage of 8 point FFT
+
+            VHADD    qY0,qV0,qV4
+            VHSUB    qY4,qV0,qV4
+            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VHSUB    dYr2,dVr2,dVi6
+                VHADD    dYi2,dVi2,dVr6
+
+                VHADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
+                VHSUB    dYi6,dVi2,dVr6
+
+                VHSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
+
+            .ELSE
+                @// Forward case: the Y2/Y6 register names are swapped relative to the output slots they are stored to
+                VHADD    dYr6,dVr2,dVi6
+                VHSUB    dYi6,dVi2,dVr6
+
+                VHSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2
+                VHADD    dYi2,dVi2,dVr6
+
+
+                VHSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6
+
+
+            .ENDIF
+
+            @// finish first stage of 8 point FFT
+
+            VHSUB    qU7,qX3,qX7
+            VMOV    dT0[0],t0                                   @// reload 1/sqrt(2): dT0 is D0, which the data[7] load into dXr7 overwrites every pass
+
+            @// finish second stage of 8 point FFT
+
+            VHSUB    dVr1,dUr1,dUi5
+            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
+            VHADD    dVi1,dUi1,dUr5
+            VHADD    dVr3,dUr1,dUi5
+            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+            VHSUB    dVi3,dUi1,dUr5
+
+            VHSUB    dVr5,dUr3,dUi7
+            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+            VHADD    dVi5,dUi3,dUr7
+            VHADD    dVr7,dUr3,dUi7
+            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
+            VHSUB    dVi7,dUi3,dUr7
+
+            @// finish third stage of 8 point FFT
+
+            .ifeqs  "\inverse", "TRUE"
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 = Vr5 * (1/sqrt(2)), Q15
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+
+                VHADD    qY1,qV1,qV5
+                VHSUB    qY5,qV1,qV5
+
+
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+
+                VHSUB    dYr3,dVr3,dVr7
+                VHSUB    dYi3,dVi3,dVi7
+                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
+                VHADD    dYr7,dVr3,dVr7
+                VHADD    dYi7,dVi3,dVi7
+
+
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7
+#endif
+            .ELSE
+                @// Forward case: the Y1/Y7 and Y3/Y5 register names are swapped relative to the output slots they are stored to
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 = Vr5 * (1/sqrt(2)), Q15
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VHADD    dYr7,dVr3,dVr7
+                VHADD    dYi7,dVi3,dVi7
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+                VHSUB    qY5,qV1,qV5
+
+                VHSUB    dYr3,dVr3,dVr7
+                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1
+                VHSUB    dYi3,dVi3,dVi7
+                VHADD    qY1,qV1,qV5
+
+
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7
+#endif
+
+            .ENDIF
+
+
+
+        .ELSE
+            @// finish first stage of 8 point FFT
+
+            VADD    qU0,qX0,qX4
+            VADD    qU2,qX1,qX5
+            VADD    qU4,qX2,qX6
+            VADD    qU6,qX3,qX7
+
+            @// finish second stage of 8 point FFT
+
+            VADD    qV0,qU0,qU4
+            VSUB    qV2,qU0,qU4
+            VADD    qV4,qU2,qU6
+            VSUB    qV6,qU2,qU6
+
+            @// finish third stage of 8 point FFT
+
+            VADD    qY0,qV0,qV4
+            VSUB    qY4,qV0,qV4
+            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VSUB    dYr2,dVr2,dVi6
+                VADD    dYi2,dVi2,dVr6
+
+                VADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
+                VSUB    dYi6,dVi2,dVr6
+
+                VSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
+
+            .ELSE
+                @// Forward case: the Y2/Y6 register names are swapped relative to the output slots they are stored to
+                VADD    dYr6,dVr2,dVi6
+                VSUB    dYi6,dVi2,dVr6
+
+                VSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2
+                VADD    dYi2,dVi2,dVr6
+
+
+                VSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6
+
+
+            .ENDIF
+
+            @// finish first stage of 8 point FFT
+
+            VSUB    qU7,qX3,qX7
+            VMOV    dT0[0],t0                                   @// reload 1/sqrt(2): dT0 is D0, which the data[7] load into dXr7 overwrites every pass
+
+            @// finish second stage of 8 point FFT
+
+            VSUB    dVr1,dUr1,dUi5
+            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
+            VADD    dVi1,dUi1,dUr5
+            VADD    dVr3,dUr1,dUi5
+            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+            VSUB    dVi3,dUi1,dUr5
+
+            VSUB    dVr5,dUr3,dUi7
+            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+            VADD    dVi5,dUi3,dUr7
+            VADD    dVr7,dUr3,dUi7
+            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
+            VSUB    dVi7,dUi3,dUr7
+
+            @// finish third stage of 8 point FFT
+
+            .ifeqs  "\inverse", "TRUE"
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 = Vr5 * (1/sqrt(2)), Q15
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+
+                VADD    qY1,qV1,qV5
+                VSUB    qY5,qV1,qV5
+
+
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+
+                VSUB    dYr3,dVr3,dVr7
+                VSUB    dYi3,dVi3,dVi7
+                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
+                VADD    dYr7,dVr3,dVr7
+                VADD    dYi7,dVi3,dVi7
+
+
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7
+#endif
+            .ELSE
+                @// Forward case: the Y1/Y7 and Y3/Y5 register names are swapped relative to the output slots they are stored to
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 = Vr5 * (1/sqrt(2)), Q15
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VADD    dYr7,dVr3,dVr7
+                VADD    dYi7,dVi3,dVi7
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+                VSUB    qY5,qV1,qV5
+
+                VSUB    dYr3,dVr3,dVr7
+                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1
+                VSUB    dYi3,dVi3,dVi7
+                VADD    qY1,qV1,qV5
+
+
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7
+#endif
+
+            .ENDIF
+
+
+        .ENDIF
+
+        SUB     pDst, pDst, step2                               @// update pDst for the next set
+        BGT     grpZeroSetLoop\name                             @// flags are still those of the SUBS at the loop top
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                             @// pSrc = start of the buffer just written (pDst - pointStep)
+        MOV     pDst,pPingPongBuf                               @// pDst = pPingPongBuf, the dest buffer for the next stage
+
+
+
+        .endm
+
+
+        @// Allocate stack memory required by the function
+
+        @// Forward radix-8 first stage, unscaled: scaled="FALSE", inverse="FALSE"; loop label gets the FWD suffix
+        M_START armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+        @// Inverse radix-8 first stage, unscaled: scaled="FALSE", inverse="TRUE"; loop label gets the INV suffix
+        M_START armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+        @// Forward radix-8 first stage, scaled (VHADD/VHSUB): scaled="TRUE", inverse="FALSE"; loop label gets the FWDSFS suffix
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+        @// Inverse radix-8 first stage, scaled (VHADD/VHSUB): scaled="TRUE", inverse="TRUE"; loop label gets the INVSFS suffix
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+    .END
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
@@ -0,0 +1,163 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   5995
+@// Last Modified Date:       Fri, 08 Jun 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT 
+@// stage for a N point complex signal.
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+            
+@// Guarding implementation by the processor name
+    
+    
+@//Input Registers
+
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define pPingPongBuf	r5
+#define subFFTNum	r6
+#define subFFTSize	r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+@// r3, r4 and r8 each carry two names below (aliases for the same register)
+#define pointStep	r3
+#define outPointStep	r3
+#define grpSize		r4
+#define setCount	r4
+#define step		r8
+#define dstStep		r8
+
+@// Neon Registers
+
+#define dX0	D0.S32
+#define dX1	D1.S32
+#define dY0	D2.S32
+#define dY1	D3.S32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// First stage, radix 2: one butterfly per loop pass; \scaled="TRUE" uses halving VHADD/VHSUB; \name makes the label unique
+        @// Define stack arguments
+        
+        
+        @// update subFFTSize (r7) and subFFTNum (r6) for the next stage
+        
+        
+        MOV        subFFTSize,#2
+        LSR        grpSize,subFFTNum,#1  
+        MOV        subFFTNum,grpSize 
+        
+        
+        @// pointStep is a byte stride: grpSize complex samples of 8 bytes each
+        @// (SC32: 32-bit real + 32-bit imaginary), i.e. pointStep = 8*grpSize
+        @// Note: outPointStep = pointStep for firststage
+        @// Note: setCount = grpSize (same register r4; decremented by 1 per butterfly)
+        
+        MOV        pointStep,grpSize,LSL #3
+        RSB        step,pointStep,#8 
+        
+        
+        @// Loop on the sets for grp zero
+
+grpZeroSetLoop\name :	
+        
+        VLD1    dX0,[pSrc],pointStep
+        VLD1    dX1,[pSrc],step                   @// step = -pointStep + 8
+        SUBS    setCount,setCount,#1              @// decrement the loop counter
+        
+        .ifeqs "\scaled", "TRUE"
+        
+            VHADD    dY0,dX0,dX1
+            VHSUB    dY1,dX0,dX1
+        
+        .ELSE
+        
+            VADD    dY0,dX0,dX1
+            VSUB    dY1,dX0,dX1
+        
+         
+        .ENDIF
+        
+        VST1    dY0,[pDst],outPointStep
+        VST1    dY1,[pDst],dstStep                  @// dstStep =  step = -pointStep + 8
+               
+        BGT     grpZeroSetLoop\name
+        
+        
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pSrc = start of the buffer just written (pDst - pointStep)
+        MOV     pDst,pPingPongBuf
+                
+        .endm
+        
+        
+        @// Forward radix-2 first stage, unscaled: scaled="FALSE", inverse="FALSE"; loop label gets the fwd suffix
+        M_START armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        
+        @// Inverse radix-2 first stage, unscaled: scaled="FALSE", inverse="TRUE"; loop label gets the inv suffix
+        M_START armSP_FFTInv_CToC_SC32_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+ 
+        
+        @// Forward radix-2 first stage, scaled (VHADD/VHSUB): scaled="TRUE", inverse="FALSE"; loop label gets the fwdsfs suffix
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",fwdsfs
+        M_END
+
+        
+        @// Inverse radix-2 first stage, scaled (VHADD/VHSUB): scaled="TRUE", inverse="TRUE"; loop label gets the invsfs suffix
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",invsfs
+        M_END
+
+	.end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
@@ -0,0 +1,184 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7493
+@// Last Modified Date:       Mon, 24 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
+@// stage for a N point complex signal.
+@// 
+
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+            
+@// Guarding implementation by the processor name
+    
+    
+@// Input registers: core-register aliases that FFTSTAGE below reads or updates.
+@// Data is loaded through pSrc, twiddles through pTwiddle, results are stored through pDst.
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define subFFTNum	r6
+#define subFFTSize	r7
+
+
+@// Output registers: no separate aliases; FFTSTAGE rewrites the input aliases in place.
+
+
+@// Local scratch registers
+@// pTmp shares r4 with grpCount; FFTSTAGE only writes pTmp after its loop has finished.
+
+#define outPointStep	r3
+#define grpCount	r4
+#define dstStep		r5
+#define pTmp		r4
+
+@// Neon registers. The suffix fixes the lane type: .S32 data, .S64 for the widened products.
+@// NOTE(review): d8-d13 (dYr1, dYi1, qT0, qT1) are callee-saved under AAPCS - confirm the caller saves them.
+#define dWr	D0.S32
+#define dWi	d1.s32
+#define dXr0	d2.s32
+#define dXi0	d3.s32
+#define dXr1	d4.s32
+#define dXi1	d5.s32
+#define dYr0	d6.s32
+#define dYi0	d7.s32
+#define dYr1	d8.s32
+#define dYi1	d9.s32
+#define qT0	q5.s64
+#define qT1	q6.s64
+	
+        .macro FFTSTAGE scaled, inverse, name
+        @// Last radix-2 stage. scaled="TRUE" selects halving add/sub, inverse="TRUE" selects the
+        @// conjugate twiddle product, name is the suffix that keeps the loop label unique.
+        MOV     outPointStep,subFFTSize,LSL #3
+        @// Set subFFTNum and derive the loop counter from subFFTSize right away
+        
+        MOV     subFFTNum,#1                            @//after the last stage
+        LSL     grpCount,subFFTSize,#1
+        
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+                               
+        RSB      dstStep,outPointStep,#16
+        @// dstStep = 16 - outPointStep: after the second store pDst ends up 16 bytes past
+        @// where the first store of this iteration started
+        @// Loop on 2 grps at a time for the last stage
+
+grpLoop\name :	
+        VLD2    {dWr,dWi},[pTwiddle :64]!
+        @// VLD2/VLD4 de-interleave: real parts land in the r registers, imaginary in the i registers
+        VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc :128]!
+        SUBS    grpCount,grpCount,#4                   @// grpCount is multiplied by 2 
+        @// complex multiply of point 1 by the twiddle (conjugated twiddle for the inverse)
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dWr,dXr1
+            VMLAL   qT0,dWi,dXi1                       @// real part
+            VMULL   qT1,dWr,dXi1
+            VMLSL   qT1,dWi,dXr1                       @// imag part
+                
+        .else
+        
+            VMULL   qT0,dWr,dXr1
+            VMLSL   qT0,dWi,dXi1                       @// real part
+            VMULL   qT1,dWr,dXi1
+            VMLAL   qT1,dWi,dXr1                       @// imag part
+                    
+        .endif
+        @// round and narrow the 64-bit products by 31 bits, overwriting point 1
+        VRSHRN  dXr1,qT0,#31
+        VRSHRN  dXi1,qT1,#31
+        
+        @// butterfly: the difference is stored first, the sum outPointStep bytes later
+        .ifeqs "\scaled", "TRUE"
+        
+            VHSUB    dYr0,dXr0,dXr1
+            VHSUB    dYi0,dXi0,dXi1
+            VHADD    dYr1,dXr0,dXr1
+            VHADD    dYi1,dXi0,dXi1
+            
+        .else
+        
+            VSUB    dYr0,dXr0,dXr1
+            VSUB    dYi0,dXi0,dXi1
+            VADD    dYr1,dXr0,dXr1
+            VADD    dYi1,dXi0,dXi1
+            
+         
+        .endif
+        @// VST2 re-interleaves real and imaginary parts
+        VST2    {dYr0,dYi0},[pDst],outPointStep
+        VST2    {dYr1,dYi1},[pDst],dstStep                  @// dstStep =  step = -outPointStep + 16
+        @// the branch uses the flags set by SUBS at the top of the loop
+        bgt     grpLoop\name
+        
+        
+        @// Reset and Swap pSrc and pDst for the next stage     
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 4*size; pSrc -= 8*size bytes           
+        SUB     pSrc,pTmp,outPointStep
+        
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 4*size bytes
+                
+        .endm
+        
+        
+                
+        @// Exported entry points for the last radix-2 SC32 stage, one FFTSTAGE expansion each.
+        @// M_START and M_END are not defined in this file (presumably from dl/api/armCOMM_s.h).
+        @// NOTE(review): only this M_START passes an explicit empty third argument - confirm it is
+        @// equivalent to omitting it as the other three entry points do.
+        M_START armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe,r4,""
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        
+        @// Inverse, unscaled: conjugate twiddle product, plain VADD/VSUB butterflies.
+        M_START armSP_FFTInv_CToC_SC32_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+ 
+        
+        @// Forward, scaled (Sfs): halving VHADD/VHSUB butterflies.
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",fwdsfs
+        M_END
+
+        
+        @// Inverse, scaled (Sfs).
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",invsfs
+        M_END
+	
+	.end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S
@@ -0,0 +1,216 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@//
+@// File Name:  armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   5638
+@// Last Modified Date:       Wed, 06 Jun 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute a Radix 2 DIT in-order out-of-place FFT stage for a N point complex signal.
+@// This handles the general stage, not the first or last stage.
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+
+           
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+    
+@// Guarding implementation by the processor name
+    
+    
+@// Input registers: core-register aliases that FFTSTAGE below reads and updates.
+
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define subFFTNum	r6
+#define subFFTSize	r7
+
+
+@// Output registers: no separate aliases; FFTSTAGE rewrites the input aliases in place.
+
+
+@// Local scratch registers
+@// pTable and pTmp both name r9; FFTSTAGE below uses pTmp only and never references pTable.
+#define outPointStep	r3
+#define pointStep	r4
+#define grpCount	r5
+#define setCount	r8
+@//const           RN  9
+#define step		r10
+#define dstStep		r11
+#define pTable		r9
+#define pTmp		r9    
+
+@// Neon registers
+@// qT0 (Q3) overlays dY0/dY1 (D6/D7) and qT1 (Q4) overlays dY2/dY3 (D8/D9); FFTSTAGE narrows the products out of qT0/qT1 before it writes dY0-dY3.
+#define dW	D0.S32
+#define dX0	D2.S32
+#define dX1	D3.S32
+#define dX2	D4.S32
+#define dX3	D5.S32
+#define dY0	D6.S32
+#define dY1	D7.S32
+#define dY2	D8.S32
+#define dY3	D9.S32
+#define qT0	Q3.S64
+#define qT1	Q4.S64
+
+    
+    
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// General (neither first nor last) radix-2 stage; name is the label suffix.
+        @// Define stack arguments
+        @// (none are defined in this macro)
+        
+        @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+        
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
+        LSL     grpCount,subFFTSize,#1
+        
+        
+        @// pointStep is first set to 4*grpSize as the multiplicand for outPointStep,
+        @// then doubled below to 8*grpSize bytes, the distance between the two butterfly inputs
+        MOV     pointStep,subFFTNum,LSL #2
+        
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        
+        @// outPointStep = grpCount * 4*grpSize bytes, the distance between the two butterfly outputs.
+        @// SMULBB multiplies the signed low halfwords only, so both operands must fit in 16 bits.
+        SMULBB  outPointStep,grpCount,pointStep  
+        LSL     pointStep,pointStep,#1    
+                               
+        @// step and dstStep rewind to the first point and advance 16 bytes (two complex values)
+        RSB      step,pointStep,#16
+        RSB      dstStep,outPointStep,#16
+        
+        @// Loop on the groups
+
+grpLoop\name :	        
+        MOV      setCount,pointStep,LSR #3
+        VLD1     dW,[pTwiddle],pointStep                @//[wi | wr]
+        
+        
+        @// Loop on the sets
+        @// each pass handles two complex values per point, hence setCount -= 2
+        
+setLoop\name :	       
+        
+        
+        VLD2    {dX0,dX1},[pSrc],pointStep            @// point0: dX0-real part dX1-img part
+        VLD2    {dX2,dX3},[pSrc],step                 @// point1: dX2-real part dX3-img part
+        
+        SUBS    setCount,setCount,#2               
+        @// point1 times the twiddle held in dW (conjugated twiddle for the inverse)
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dX2,dW[0]
+            VMLAL   qT0,dX3,dW[1]                       @// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLSL   qT1,dX2,dW[1]                       @// imag part
+                
+        .else
+        
+            VMULL   qT0,dX2,dW[0]
+            VMLSL   qT0,dX3,dW[1]                       @// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLAL   qT1,dX2,dW[1]                       @// imag part
+                    
+        .endif
+        @// round and narrow the 64-bit products by 31 bits, overwriting point1
+        VRSHRN  dX2,qT0,#31
+        VRSHRN  dX3,qT1,#31
+        @// butterfly: the difference is stored first, the sum outPointStep bytes later
+        .ifeqs "\scaled", "TRUE"
+            VHSUB    dY0,dX0,dX2
+            VHSUB    dY1,dX1,dX3
+            VHADD    dY2,dX0,dX2
+            VHADD    dY3,dX1,dX3
+                
+        .else
+            VSUB    dY0,dX0,dX2
+            VSUB    dY1,dX1,dX3
+            VADD    dY2,dX0,dX2
+            VADD    dY3,dX1,dX3
+        
+        .endif
+        
+        VST2    {dY0,dY1},[pDst],outPointStep
+        VST2    {dY2,dY3},[pDst],dstStep              @// dstStep = -outPointStep + 16
+        
+        BGT     setLoop\name
+        @// SUBS sets the flags for the group loop; the ADD below leaves them untouched
+        SUBS    grpCount,grpCount,#2               
+        ADD     pSrc,pSrc,pointStep
+        BGT     grpLoop\name    
+        
+        
+        @// Reset and Swap pSrc and pDst for the next stage     
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 4*size; pSrc -= 8*size bytes           
+        SUB     pSrc,pTmp,outPointStep
+        
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 4*size bytes
+        
+        
+        .endm
+        
+        
+        
+        @// Exported entry points for the general radix-2 SC32 stage, one FFTSTAGE expansion each.
+        @// M_START and M_END are not defined in this file (presumably from dl/api/armCOMM_s.h).
+        M_START armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+        
+        @// Inverse, unscaled: conjugate twiddle product, plain VADD/VSUB butterflies.
+        M_START armSP_FFTInv_CToC_SC32_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+ 
+        
+        @// Forward, scaled (Sfs): halving VHADD/VHSUB butterflies.
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+        
+        @// Inverse, scaled (Sfs).
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+	.end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
@@ -0,0 +1,320 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7767
+@// Last Modified Date:       Thu, 27 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the first stage of a Radix 4 FFT for an N point complex signal
+@// 
+
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+@// Guarding implementation by the processor name
+    
+    
+@// Input registers: core-register aliases for FFTSTAGE below.
+@// pTwiddle is defined but FFTSTAGE below never references it; pPingPongBuf becomes pDst on exit.
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define pPingPongBuf	r5
+#define subFFTNum	r6
+#define subFFTSize	r7
+
+
+@// Output registers: no separate aliases; FFTSTAGE rewrites the input aliases in place.
+
+
+@// Local scratch registers
+@// Two pairs share a register: grpSize/setCount (r3) and pointStep/outPointStep (r4).
+#define grpSize		r3
+@// Reuse grpSize as setCount
+#define setCount	r3
+#define pointStep	r4
+#define outPointStep	r4
+#define setStep		r8
+#define step1		r9
+#define step3		r10
+
+@// Neon registers. Each q alias overlays the d pair of one point: qXn = dXrn:dXin, qYn = dYrn:dYin, qZn = dZrn:dZin.
+@// NOTE(review): D8-D15 (the dY and qY aliases) are callee-saved under AAPCS - confirm the caller saves them.
+#define dXr0	D0.S32
+#define dXi0	D1.S32
+#define dXr1	D2.S32
+#define dXi1	D3.S32
+#define dXr2	D4.S32
+#define dXi2	D5.S32
+#define dXr3	D6.S32
+#define dXi3	D7.S32
+#define dYr0	D8.S32
+#define dYi0	D9.S32
+#define dYr1	D10.S32
+#define dYi1	D11.S32
+#define dYr2	D12.S32
+#define dYi2	D13.S32
+#define dYr3	D14.S32
+#define dYi3	D15.S32
+#define qX0	Q0.S32
+#define qX1	Q1.S32
+#define qX2	Q2.S32
+#define qX3	Q3.S32
+#define qY0	Q4.S32
+#define qY1	Q5.S32
+#define qY2	Q6.S32
+#define qY3	Q7.S32
+#define dZr0	D16.S32
+#define dZi0	D17.S32
+#define dZr1	D18.S32
+#define dZi1	D19.S32
+#define dZr2	D20.S32
+#define dZi2	D21.S32
+#define dZr3	D22.S32
+#define dZi3	D23.S32
+#define qZ0	Q8.S32
+#define qZ1	Q9.S32
+#define qZ2	Q10.S32
+#define qZ3	Q11.S32
+
+    
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// First radix-4 stage: no twiddle multiplies, two sets (two complex values per point) per pass.
+        @// Define stack arguments
+        @// (none are defined in this macro)
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pointStep = 2*subFFTNum bytes = 8*grpSize bytes, since grpSize = subFFTNum/4 below
+        @// Note: outPointStep = pointStep for firststage
+        
+        MOV     pointStep,subFFTNum,LSL #1
+        
+        
+        @// Update pSubFFTSize and pSubFFTNum regs
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        MOV     subFFTSize,#4                                 @// subFFTSize = 4 after the first radix-4 stage
+        
+        @// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]  
+        MOV     subFFTNum,grpSize
+        
+                                       
+        @// Calculate the step of input data for the next set
+        @//MOV     setStep,pointStep,LSL #1
+        MOV     setStep,grpSize,LSL #4                      @// 16*grpSize = 2*pointStep
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        ADD     setStep,setStep,pointStep                   @// setStep = 3*pointStep
+        RSB     setStep,setStep,#16                         @// setStep = - 3*pointStep+16
+        
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
+        MOV     step1,pointStep,LSL #1                      @// step1 = 2*pointStep
+        
+        .ifeqs "\scaled", "TRUE"
+            VHADD    qY0,qX0,qX2
+        .else
+            VADD    qY0,qX0,qX2
+        .endif
+            
+        RSB     step3,pointStep,#0                          @// step3 = -pointStep                          
+        
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets : 2 sets at a time
+
+grpZeroSetLoop\name :	
+        @// Each pass finishes the sets loaded earlier and loads the following ones
+        @// (load order data[0], data[2], data[1], data[3]).
+        @// NOTE(review): the final pass therefore still loads one more set - confirm the source buffer tolerates that read.
+        @// Decrement setcount
+        SUBS    setCount,setCount,#2                    @// decrement the set loop counter           
+        
+        .ifeqs "\scaled", "TRUE" 
+        
+            @// finish first stage of 4 point FFT 
+                        
+            VHSUB    qY2,qX0,qX2
+            
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VHADD    qY1,qX1,qX3
+            VLD2    {dXr2,dXi2},[pSrc :128],step3          @//  data[2]
+            VHSUB    qY3,qX1,qX3
+            
+                       
+            @// finish second stage of 4 point FFT 
+            @// both branches compute the same dZ2 and dZ3; only their store positions (2nd vs 4th) swap
+            .ifeqs "\inverse", "TRUE"
+                   
+                VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+                VHADD    qZ0,qY0,qY1
+            
+                VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set    
+                VHSUB    dZr3,dYr2,dYi3
+                
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHADD    dZi3,dYi2,dYr3
+                
+                VHSUB    qZ1,qY0,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                
+                VHADD    dZr2,dYr2,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHSUB    dZi2,dYi2,dYr3
+                
+                VHADD    qY0,qX0,qX2                     @// u0 for next iteration
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+                
+                
+            .else
+                
+                VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+                VHADD    qZ0,qY0,qY1
+            
+                VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
+                VHADD    dZr2,dYr2,dYi3
+            
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi2,dYi2,dYr3
+            
+                VHSUB    qZ1,qY0,qY1
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            
+                VHSUB    dZr3,dYr2,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHADD    dZi3,dYi2,dYr3
+            
+                VHADD    qY0,qX0,qX2                     @// u0 for next iteration
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+            
+            .endif
+            
+        
+        
+        .else
+        
+            @// finish first stage of 4 point FFT 
+            
+            
+            VSUB    qY2,qX0,qX2
+            
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VADD    qY1,qX1,qX3
+            VLD2    {dXr2,dXi2},[pSrc :128],step3          @//  data[2]
+            VSUB    qY3,qX1,qX3
+            
+                       
+            @// finish second stage of 4 point FFT 
+                                                
+            .ifeqs "\inverse", "TRUE" 
+                   
+                VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+                VADD    qZ0,qY0,qY1
+            
+                VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set    
+                VSUB    dZr3,dYr2,dYi3
+                
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VADD    dZi3,dYi2,dYr3
+                
+                VSUB    qZ1,qY0,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                
+                VADD    dZr2,dYr2,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VSUB    dZi2,dYi2,dYr3
+                
+                VADD    qY0,qX0,qX2                     @// u0 for next iteration
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+                
+                
+            .else
+                
+                VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+                VADD    qZ0,qY0,qY1
+            
+                VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
+                VADD    dZr2,dYr2,dYi3
+            
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi2,dYi2,dYr3
+            
+                VSUB    qZ1,qY0,qY1
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            
+                VSUB    dZr3,dYr2,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VADD    dZi3,dYi2,dYr3
+            
+                VADD    qY0,qX0,qX2                     @// u0 for next iteration
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+            
+            .endif
+            
+        .endif
+        
+        BGT     grpZeroSetLoop\name
+        
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep (8*grpSize bytes)
+        MOV     pDst,pPingPongBuf
+        
+        
+        .endm
+
+                
+        
+        @// Exported entry points for the first radix-4 SC32 stage, one FFTSTAGE expansion each.
+        @// M_START and M_END are not defined in this file (presumably from dl/api/armCOMM_s.h).
+        M_START armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        
+        @// Inverse, unscaled: swaps the store positions of dZ2 and dZ3, plain VADD/VSUB.
+        M_START armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+ 
+        @// Forward, scaled (Sfs): halving VHADD/VHSUB butterflies.
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",fwdsfs
+        M_END
+
+        @// Inverse, scaled (Sfs).
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",invsfs
+        M_END
+    
+	.end
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S
@@ -0,0 +1,404 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7767
+@// Last Modified Date:       Thu, 27 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the last stage of a Radix 4 FFT for an N point complex signal
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+    
+@// Guarding implementation by the processor name
+    
+    
+@// Import symbols required from other files
+@// (For example tables)
+    @//IMPORT  armAAC_constTable    
+    
+@// Input registers: core-register aliases for the FFTSTAGE macro that follows.
+
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define subFFTNum	r6
+#define subFFTSize	r7
+
+
+
+@// Output registers: no separate aliases are defined.
+
+
+@// Local scratch registers
+@// grpCount and pTmp both name r4.
+#define outPointStep	r3
+#define grpCount	r4
+#define dstStep		r5
+#define grpTwStep	r8
+#define stepTwiddle	r9
+#define twStep		r10
+#define pTmp		r4
+#define step16		r11
+#define step24		r12
+
+
+@// Neon registers
+@// The dButterfly and dX aliases are two sets of names for the same registers D0-D7.
+#define dButterfly1Real02	D0.S32
+#define dButterfly1Imag02	D1.S32
+#define dButterfly1Real13	D2.S32
+#define dButterfly1Imag13	D3.S32
+#define dButterfly2Real02	D4.S32
+#define dButterfly2Imag02	D5.S32
+#define dButterfly2Real13	D6.S32
+#define dButterfly2Imag13	D7.S32
+#define dXr0			D0.S32
+#define dXi0			D1.S32
+#define dXr1			D2.S32
+#define dXi1			D3.S32
+#define dXr2			D4.S32
+#define dXi2			D5.S32
+#define dXr3			D6.S32
+#define dXi3			D7.S32
+@// The dY aliases (D16-D23) occupy the same registers as Q8-Q11, which are also named by the qT and qY aliases.
+#define dYr0			D16.S32
+#define dYi0			D17.S32
+#define dYr1			D18.S32
+#define dYi1			D19.S32
+#define dYr2			D20.S32
+#define dYi2			D21.S32
+#define dYr3			D22.S32
+#define dYi3			D23.S32
+@// Twiddle lanes (D8-D13) and 64-bit product accumulators (Q7-Q12). NOTE(review): D8-D15 are callee-saved under AAPCS - confirm the caller saves them.
+#define dW1r			D8.S32
+#define dW1i			D9.S32
+#define dW2r			D10.S32
+#define dW2i			D11.S32
+#define dW3r			D12.S32
+#define dW3i			D13.S32
+#define qT0			Q7.S64
+#define qT1			Q8.S64
+#define qT2			Q9.S64
+#define qT3			Q10.S64
+#define qT4			Q11.S64
+#define qT5			Q12.S64
+@// The first dZ pair (D14/D15) is Q7, shared with the first product accumulator; the other dZ pairs are Q13-Q15.
+#define dZr0			D14.S32
+#define dZi0			D15.S32
+#define dZr1			D26.S32
+#define dZi1			D27.S32
+#define dZr2			D28.S32
+#define dZi2			D29.S32
+#define dZr3			D30.S32
+#define dZi3			D31.S32
+@// Quad-register views of the d pairs defined above.
+#define qX0			Q0.S32
+#define qY0			Q8.S32
+#define qY1			Q9.S32   
+#define qY2			Q10.S32
+#define qY3			Q11.S32
+#define qZ0			Q7.S32
+#define qZ1			Q13.S32   
+#define qZ2			Q14.S32
+#define qZ3			Q15.S32
+
+
+        
+        .MACRO FFTSTAGE scaled, inverse , name
+        
+        @// Define stack arguments
+        
+        
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes 
+        MOV     outPointStep,subFFTSize,LSL #3
+        
+        @// Update grpCount and grpSize rightaway 
+        
+        VLD2    {dW1r,dW1i},[pTwiddle :128]                          @// [wi|wr]
+        MOV     step16,#16
+        LSL     grpCount,subFFTSize,#2
+        
+        VLD1    dW2r,[pTwiddle :64]                             @// [wi|wr]
+        MOV     subFFTNum,#1                            @//after the last stage
+        
+        VLD1    dW3r,[pTwiddle :64],step16                     @// [wi|wr]
+        MOV     stepTwiddle,#0
+        
+        VLD1    dW2i,[pTwiddle :64]!                            @// [wi|wr]
+        SUB     grpTwStep,stepTwiddle,#8                    @// grpTwStep = -8 to start with       
+        
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        VLD1    dW3i,[pTwiddle :64],grpTwStep                           @// [wi|wr]
+        MOV     dstStep,outPointStep,LSL #1
+        
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
+        MOV     step24,#24 
+
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        
+
+        @// Process two groups at a time
+        
+grpLoop\name :	
+        
+        VZIP    dW2r,dW2i
+        ADD     stepTwiddle,stepTwiddle,#16                 @// increment for the next iteration
+        VZIP    dW3r,dW3i
+        ADD     grpTwStep,stepTwiddle,#4
+        VUZP     dButterfly1Real13, dButterfly2Real13        @// B.r D.r
+        SUB     twStep,stepTwiddle,#16                      @// -16+stepTwiddle
+        VUZP     dButterfly1Imag13, dButterfly2Imag13        @// B.i D.i
+        MOV     grpTwStep,grpTwStep,LSL #1
+        VUZP     dButterfly1Real02, dButterfly2Real02        @// A.r C.r
+        RSB     grpTwStep,grpTwStep,#0                      @// -8-2*stepTwiddle
+        
+        
+        VUZP     dButterfly1Imag02, dButterfly2Imag02        @// A.i C.i
+        
+        
+        SUBS    grpCount,grpCount,#8                    @// grpCount is multiplied by 4
+                
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dW1r,dXr1
+            VMLAL   qT0,dW1i,dXi1                       @// real part
+            VMULL   qT1,dW1r,dXi1
+            VMLSL   qT1,dW1i,dXr1                       @// imag part
+                
+        .else
+        
+            VMULL   qT0,dW1r,dXr1
+            VMLSL   qT0,dW1i,dXi1                       @// real part
+            VMULL   qT1,dW1r,dXi1
+            VMLAL   qT1,dW1i,dXr1                       @// imag part
+                    
+        .endif
+        
+        VLD2    {dW1r,dW1i},[pTwiddle :128],stepTwiddle      @// [wi|wr]
+        
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT2,dW2r,dXr2
+            VMLAL   qT2,dW2i,dXi2                       @// real part
+            VMULL   qT3,dW2r,dXi2
+            VLD1    dW2r,[pTwiddle :64],step16                  @// [wi|wr]
+            VMLSL   qT3,dW2i,dXr2                       @// imag part
+                
+        .else
+        
+            VMULL   qT2,dW2r,dXr2
+            VMLSL   qT2,dW2i,dXi2                       @// real part
+            VMULL   qT3,dW2r,dXi2
+            VLD1    dW2r,[pTwiddle :64],step16                  @// [wi|wr]
+            VMLAL   qT3,dW2i,dXr2                       @// imag part
+                    
+        .endif
+        
+        
+        VRSHRN  dZr1,qT0,#31
+        VLD1    dW2i,[pTwiddle :64],twStep                  @// [wi|wr] 
+        VRSHRN  dZi1,qT1,#31
+        
+        VMOV     qZ0,qX0                                @// move qX0 so as to load for the next iteration
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        
+                
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT4,dW3r,dXr3
+            VMLAL   qT4,dW3i,dXi3                       @// real part
+            VMULL   qT5,dW3r,dXi3
+            VLD1    dW3r,[pTwiddle :64],step24
+            VMLSL   qT5,dW3i,dXr3                       @// imag part
+                
+        .else
+        
+            VMULL   qT4,dW3r,dXr3
+            VMLSL   qT4,dW3i,dXi3                       @// real part
+            VMULL   qT5,dW3r,dXi3
+            VLD1    dW3r,[pTwiddle :64],step24
+            VMLAL   qT5,dW3i,dXr3                       @// imag part
+                    
+        .endif
+        
+        VRSHRN  dZr2,qT2,#31
+        VLD1    dW3i,[pTwiddle :64],grpTwStep                           @// [wi|wr]
+        VRSHRN  dZi2,qT3,#31
+        
+        VRSHRN  dZr3,qT4,#31
+        VRSHRN  dZi3,qT5,#31
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        
+                
+        .ifeqs "\scaled", "TRUE"
+        
+            @// finish first stage of 4 point FFT 
+            
+            VHADD    qY0,qZ0,qZ2
+            VHSUB    qY2,qZ0,qZ2
+            VHADD    qY1,qZ1,qZ3
+            VHSUB    qY3,qZ1,qZ3
+            
+                        
+            @// finish second stage of 4 point FFT 
+            
+            .ifeqs  "\inverse", "TRUE"
+
+                VHSUB    qZ0,qY2,qY1
+            
+                VHADD    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+                                
+                VHADD    qZ2,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+            
+                VHSUB    dZr1,dYr0,dYi3
+