diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp new file mode 100644 --- /dev/null +++ b/gfx/ycbcr/yuv_row_win64.cpp @@ -0,0 +1,205 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "yuv_row.h" + +extern "C" { + +// x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics. + +#define kCoefficientsRgbU (reinterpret_cast(kCoefficientsRgbY) + 2048) +#define kCoefficientsRgbV (reinterpret_cast(kCoefficientsRgbY) + 4096) + +#include + +static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __m128i xmm0, xmmY1, xmmY2; + __m128 xmmY; + + while (width >= 2) { + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); + + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * *y_buf++)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * *y_buf++)); + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); + + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), + 0x44); + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); + rgb_buf += 8; + width -= 2; + } + + if (width) { + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * *y_buf)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + xmmY1 = _mm_srai_epi16(xmmY1, 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + *reinterpret_cast(rgb_buf) = _mm_cvtsi128_si32(xmmY1); + } +} + +static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + __m128i xmm0, xmmY1, xmmY2; + __m128 xmmY; + uint8 u, v, y; + int x = 0; + + while (width >= 2) { + u = u_buf[x >> 17]; + v = v_buf[x >> 17]; + y = y_buf[x >> 16]; + x += source_dx; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + + y = y_buf[x >> 16]; + x += source_dx; + + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); + + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), + 0x44); + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); + rgb_buf += 8; + width -= 2; + } + + if (width) { + u = u_buf[x >> 17]; + v = v_buf[x >> 17]; + y = y_buf[x >> 16]; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + xmmY1 = _mm_srai_epi16(xmmY1, 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + *reinterpret_cast(rgb_buf) = _mm_cvtsi128_si32(xmmY1); + } +} + +static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + __m128i xmm0, xmmY1, xmmY2; + __m128 xmmY; + uint8 u0, u1, v0, v1, y0, y1; + uint32 uv_frac, y_frac, u, v, y; + int x = 0; + + if (source_dx >= 0x20000) { + x = 32768; + } + + while(width >= 2) { + u0 = u_buf[x >> 17]; + u1 = u_buf[(x >> 17) + 1]; + v0 = v_buf[x >> 17]; + v1 = v_buf[(x >> 17) + 1]; + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + uv_frac = (x & 0x1fffe); + y_frac = (x & 0xffff); + u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; + v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; + x += source_dx; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + y_frac = (x & 0xffff); + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; + x += source_dx; + + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); + + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), + 0x44); + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); + rgb_buf += 8; + width -= 2; + } + + if (width) { + u = u_buf[x >> 17]; + v = v_buf[x >> 17]; + y = y_buf[x >> 16]; + + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); + + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); + xmmY1 = _mm_srai_epi16(xmmY1, 6); + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); + *reinterpret_cast(rgb_buf) = _mm_cvtsi128_si32(xmmY1); + } +} + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); +} + +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); +} + +void LinearScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, + source_dx); +} + +} // extern "C"