media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c

/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp9_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
                                            uint32_t stride,
                                            uint8_t *frm2_ptr,
                                            int32_t filt_sth,
                                            int32_t filt_wgt,
                                            uint32_t *acc,
                                            uint16_t *cnt) {
  uint32_t row;
  uint64_t f0, f1, f2, f3;
  v16i8 frm2, frm1 = { 0 };
  v16i8 frm4, frm3 = { 0 };
  v16u8 frm_r, frm_l;
  v8i16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

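  /* Each iteration covers four 8-pixel rows: rows 0-1 are packed into frm1
   * and rows 2-3 into frm3, so two iterations complete the 8x8 block. */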
  for (row = 2; row--;) {
    LD4(frm1_ptr, stride, f0, f1, f2, f3);
    frm1_ptr += (4 * stride);

    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 32;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    INSERT_D2_SB(f0, f1, frm1);
    INSERT_D2_SB(f2, f3, frm3);
    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
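    /* Widen the differences to 32 bits, form 3 * diff * diff and apply a
     * rounding arithmetic shift by the filter strength. */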
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

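    /* Clamp the modifier: modifier = 16 - min(16, mod). The comparisons
     * build all-ones masks for lanes with mod < 16; lanes at or above 16
     * are masked to zero. */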
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

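    /* Apply the filter weight, then add the modifier to the per-pixel
     * counts (cnt advances 16 entries at a time). */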
    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

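    /* Scale the frm2 pixels by the weighted modifier and add the products
     * to the 32-bit accumulators. */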
    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

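    /* Same filtering for rows 2-3 (the frm3/frm4 pair). */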
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;
    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;
  }
}

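/* Apply the temporal filter to one 16x16 block: the same computation as
 * the 8x8 kernel, processing two 16-pixel rows per loop iteration. */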
static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
                                             uint32_t stride,
                                             uint8_t *frm2_ptr,
                                             int32_t filt_sth,
                                             int32_t filt_wgt,
                                             uint32_t *acc,
                                             uint16_t *cnt) {
  uint32_t row;
  v16i8 frm1, frm2, frm3, frm4;
  v16u8 frm_r, frm_l;
  v16i8 zero = { 0 };
  v8u16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

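  /* Rows are handled in pairs: frm1 with frm2 first, then frm3 with frm4. */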
  for (row = 8; row--;) {
    LD_SB2(frm1_ptr, stride, frm1, frm3);
    frm1_ptr += stride;

    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 16;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

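    /* Zero-extend the frm2 bytes and accumulate modifier * pixel. */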
    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

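    /* Second row of the pair (frm3/frm4). */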
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    frm1_ptr += stride;
    frm2_ptr += 16;
  }
}

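/* Dispatch 8x8 and 16x16 blocks to the MSA kernels; any other size falls
 * back to the C implementation. */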
void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
                                   uint8_t *frame2_ptr, uint32_t blk_w,
                                   uint32_t blk_h, int32_t strength,
                                   int32_t filt_wgt, uint32_t *accu,
                                   uint16_t *cnt) {
  if (8 == blk_w && 8 == blk_h) {
    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
                                    strength, filt_wgt, accu, cnt);
  } else if (16 == blk_w && 16 == blk_h) {
    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
                                     strength, filt_wgt, accu, cnt);
  } else {
    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
                                strength, filt_wgt, accu, cnt);
  }
}