diff options
Diffstat (limited to 'graphics/lepton/files/patch-cpu')
-rw-r--r-- | graphics/lepton/files/patch-cpu | 217 |
1 files changed, 0 insertions, 217 deletions
diff --git a/graphics/lepton/files/patch-cpu b/graphics/lepton/files/patch-cpu deleted file mode 100644 index 4ee98d914194..000000000000 --- a/graphics/lepton/files/patch-cpu +++ /dev/null @@ -1,217 +0,0 @@ -Make -- or attempt to -- the code work on CPUs with only SSSE3 -instruction set... - - -mi - ---- src/lepton/idct.cc -+++ src/lepton/idct.cc -@@ -1,8 +1,7 @@ - /* -*-mode:c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ --#include <emmintrin.h> --#include <smmintrin.h> - #include <immintrin.h> - #include "../vp8/util/aligned_block.hh" -+#include "../vp8/util/mm_mullo_epi32.hh" - - namespace idct_local{ - enum { -@@ -23,7 +21,10 @@ enum { - r2 = 181 // 256/sqrt(2) - }; - } --void idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) { -+ -+#ifndef __SSE2__ -+static void -+idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) { - int32_t intermed[64]; - using namespace idct_local; - // Horizontal 1-D IDCT. -@@ -149,6 +150,8 @@ void idct_scalar(const AlignedBlock &blo - //outp[i]>>=3; - } - } -+#else /* At least SSE2 is available { */ -+ - template<int which_vec, int offset, int stride> __m128i vget_raster(const AlignedBlock&block) { - return _mm_set_epi32(block.coefficients_raster(which_vec + 3 * stride + offset), - block.coefficients_raster(which_vec + 2 * stride + offset), -@@ -162,8 +165,8 @@ template<int offset, int stride> __m128i - q[which_vec + offset])); - } - -- --__m128i epi32l_to_epi16(__m128i lowvec) { -+static __m128i -+epi32l_to_epi16(__m128i lowvec) { - return _mm_shuffle_epi8(lowvec, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, - 0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0)); - } -@@ -181,9 +184,8 @@ __m128i epi32l_to_epi16(__m128i lowvec) - }while(0) - - -- -- --void idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { -+static void -+idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { - - char vintermed_storage[64 * sizeof(int32_t) + 16]; - // align intermediate storage to 16 bytes -@@ -202,7 +204,12 @@ void idct_sse(const AlignedBlock &block, - xv6 = vget_raster<0, 5, 8>(block); - xv7 = vget_raster<0, 3, 8>(block); - if (__builtin_expect(ignore_dc, true)) { -+#ifdef __SSE4_1__ - xv0 = _mm_insert_epi32(xv0, 0, 0); -+#else -+// See http://stackoverflow.com/questions/38384520/is-there-a-sse2-equivalent-for-mm-insert-epi32 -+ xv0 = _mm_and_si128(xv0, _mm_set_epi32(-1,-1,-1, 0)); -+#endif - } - } else { - xv0 = vget_raster<32, 0, 8>(block); -@@ -378,7 +385,8 @@ __m128i m256_to_epi16(__m256i vec) { - - }*/ - #if __AVX2__ --void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { -+static void -+idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { - // align intermediate storage to 16 bytes - using namespace idct_local; - // Horizontal 1-D IDCT. -@@ -589,11 +597,16 @@ void idct_avx(const AlignedBlock &block, - #endif - } - } --#else --void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { -- idct_sse(block, q, voutp, ignore_dc); --} - #endif --void idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { -+#endif /* } SSE2 or higher is available */ -+ -+void -+idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { -+#ifdef __AVX2__ - idct_avx(block, q, voutp, ignore_dc); -+#elif __SSE2__ -+ idct_sse(block, q, voutp, ignore_dc); -+#else -+ idct_scalar(block, q, voutp, ignore_dc); -+#endif - } ---- src/lepton/vp8_encoder.cc -+++ src/lepton/vp8_encoder.cc -@@ -150,29 +150,34 @@ void VP8ComponentEncoder::process_row(Pr - } - } - uint32_t aligned_block_cost(const AlignedBlock &block) { -- uint32_t cost = 16; // .25 cost for zeros -- if (VECTORIZE) { -- for (int i = 0; i < 64; i+= 8) { -- __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i))); -- __m128i v_cost = _mm_set1_epi16(0); -- while (!_mm_test_all_zeros(val, val)) { -- __m128i mask = _mm_cmpgt_epi16(val, _mm_setzero_si128()); -- v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2))); -- val = _mm_srli_epi16(val, 1); -- } -- __m128i sum = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8)); -- sum = _mm_add_epi16(sum ,_mm_srli_si128(sum, 4)); -- sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 2)); -- cost += _mm_extract_epi16(sum, 0); -- } -- } else { -- uint32_t scost = 0; -- for (int i = 0; i < 64; ++i) { -- scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i])); -+#ifdef __SSE2__ /* SSE2 or higher instruction set available { */ -+ const __m128i zero = _mm_setzero_si128(); -+ __m128i v_cost; -+ for (int i = 0; i < 64; i+= 8) { -+ __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i))); -+ v_cost = _mm_set1_epi16(0); -+#ifndef __SSE4_1__ -+ while (_mm_movemask_epi8(_mm_cmpeq_epi32(val, zero)) != 0xFFFF) -+#else -+ while (!_mm_test_all_zeros(val, val)) -+#endif -+ { -+ __m128i mask = _mm_cmpgt_epi16(val, zero); -+ v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2))); -+ val = _mm_srli_epi16(val, 1); - } -- cost = scost; -+ v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8)); -+ v_cost = _mm_add_epi16(v_cost ,_mm_srli_si128(v_cost, 4)); -+ v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 2)); - } -- return cost; -+ return 16 + _mm_extract_epi16(v_cost, 0); -+#else /* } No SSE2 instructions { */ -+ uint32_t scost = 0; -+ for (int i = 0; i < 64; ++i) { -+ scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i])); -+ } -+ return scost; -+#endif /* } */ - } - - #ifdef ALLOW_FOUR_COLORS ---- src/vp8/model/model.hh -+++ src/vp8/model/model.hh -@@ -11,9 +11,7 @@ - #include "branch.hh" - #include "../util/aligned_block.hh" - #include "../util/block_based_image.hh" --#include <smmintrin.h> --#include <immintrin.h> --#include <emmintrin.h> -+#include "../util/mm_mullo_epi32.hh" - - class BoolEncoder; - constexpr bool advanced_dc_prediction = true; ---- src/vp8/model/numeric.hh -+++ src/vp8/model/numeric.hh -@@ -8,8 +8,8 @@ - // for std::min - #include <algorithm> - #include <assert.h> --#include <smmintrin.h> --#include <emmintrin.h> -+#include <immintrin.h> -+#include "../util/mm_mullo_epi32.hh" - - #ifdef _WIN32 - #include <intrin.h> ---- src/vp8/util/mm_mullo_epi32.hh -+++ src/vp8/util/mm_mullo_epi32.hh -@@ -0,0 +1,16 @@ -+#if defined(__SSE2__) && !defined(__SSE4_1__) && !defined(MM_MULLO_EPI32_H) -+#define MM_MULLO_EPI32_H -+#include <immintrin.h> -+// See: http://stackoverflow.com/questions/10500766/sse-multiplication-of-4-32-bit-integers -+// and https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768 -+static inline __m128i -+_mm_mullo_epi32(const __m128i &a, const __m128i &b) -+{ -+ __m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/ -+ __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4), -+ _mm_srli_si128(b,4)); /* mul 3,1 */ -+ return _mm_unpacklo_epi32( /* shuffle results to [63..0] and pack */ -+ _mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), -+ _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); -+} -+#endif -+++ src/lepton/recoder.cc -@@ -99,5 +99,5 @@ - - static bool aligned_memchr16ff(const unsigned char *local_huff_data) { --#if 1 -+#if !defined(__i386__) - __m128i buf = _mm_load_si128((__m128i const*)local_huff_data); - __m128i ff = _mm_set1_epi8(-1); |