diff options
author | Mikhail Teterin <mi@FreeBSD.org> | 2016-07-15 03:05:26 +0000 |
---|---|---|
committer | Mikhail Teterin <mi@FreeBSD.org> | 2016-07-15 03:05:26 +0000 |
commit | afb2268a389d534a39f5930da34cdd1b2219f654 (patch) | |
tree | 1c5e9252a97e819638faddc307b3cf7b9a8fdf43 /graphics/lepton/files/patch-cpu | |
parent | Add port of "lepton" -- a tool for manipulating files in LEP-format, (diff) |
Finish up the port, which got committed too early by accident:
. Note, that it works on CPUs with at least SSSE3
instruction set -- the original code assumes SSE4,
but that was relatively easy to patch
. Do not attempt to build, if SSSE3 not among CPU-options
. Fix up formatting warnings (reported upstream)
. Fix a crash on i386. Unfortunately, 5 of the 40
self-tests still fail on i386 -- the problem reported
upstream
Notes
Notes:
svn path=/head/; revision=418568
Diffstat (limited to 'graphics/lepton/files/patch-cpu')
-rw-r--r-- | graphics/lepton/files/patch-cpu | 217 |
1 files changed, 217 insertions, 0 deletions
diff --git a/graphics/lepton/files/patch-cpu b/graphics/lepton/files/patch-cpu new file mode 100644 index 000000000000..763c149eb95e --- /dev/null +++ b/graphics/lepton/files/patch-cpu @@ -0,0 +1,217 @@ +Make -- or attempt to -- the code work on CPUs with only SSE3 +instruction set... + + -mi + +--- src/lepton/idct.cc ++++ src/lepton/idct.cc +@@ -1,8 +1,6 @@ + /* -*-mode:c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +-#include <emmintrin.h> +-#include <smmintrin.h> +-#include <immintrin.h> + #include "../vp8/util/aligned_block.hh" ++#include "../vp8/util/mm_mullo_epi32.hh" + + namespace idct_local{ + enum { +@@ -23,7 +21,10 @@ enum { + r2 = 181 // 256/sqrt(2) + }; + } +-void idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) { ++ ++#ifndef __SSE2__ ++static void ++idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) { + int32_t intermed[64]; + using namespace idct_local; + // Horizontal 1-D IDCT. +@@ -149,6 +150,8 @@ void idct_scalar(const AlignedBlock &blo + //outp[i]>>=3; + } + } ++#else /* At least SSE2 is available { */ ++ + template<int which_vec, int offset, int stride> __m128i vget_raster(const AlignedBlock&block) { + return _mm_set_epi32(block.coefficients_raster(which_vec + 3 * stride + offset), + block.coefficients_raster(which_vec + 2 * stride + offset), +@@ -162,8 +165,8 @@ template<int offset, int stride> __m128i + q[which_vec + offset])); + } + +- +-__m128i epi32l_to_epi16(__m128i lowvec) { ++static __m128i ++epi32l_to_epi16(__m128i lowvec) { + return _mm_shuffle_epi8(lowvec, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, + 0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0)); + } +@@ -181,9 +184,8 @@ __m128i epi32l_to_epi16(__m128i lowvec) + }while(0) + + +- +- +-void idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { ++static void ++idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { + + char vintermed_storage[64 * sizeof(int32_t) + 16]; + // align intermediate storage to 16 bytes +@@ -202,7 +204,12 @@ void idct_sse(const AlignedBlock &block, + xv6 = vget_raster<0, 5, 8>(block); + xv7 = vget_raster<0, 3, 8>(block); + if (__builtin_expect(ignore_dc, true)) { ++#ifdef __SSE4_1__ + xv0 = _mm_insert_epi32(xv0, 0, 0); ++#else ++// See http://stackoverflow.com/questions/38384520/is-there-a-sse2-equivalent-for-mm-insert-epi32 ++ xv0 = _mm_insert_epi16(_mm_insert_epi16(xv0, 0, 0), 0, 1); ++#endif + } + } else { + xv0 = vget_raster<32, 0, 8>(block); +@@ -378,7 +385,8 @@ __m128i m256_to_epi16(__m256i vec) { + + }*/ + #if __AVX2__ +-void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { ++static void ++idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { + // align intermediate storage to 16 bytes + using namespace idct_local; + // Horizontal 1-D IDCT. +@@ -589,11 +597,16 @@ void idct_avx(const AlignedBlock &block, + #endif + } + } +-#else +-void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { +- idct_sse(block, q, voutp, ignore_dc); +-} + #endif +-void idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { ++#endif /* } SSE2 or higher is available */ ++ ++void ++idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { ++#ifdef __AVX2__ + idct_avx(block, q, voutp, ignore_dc); ++#elif __SSE2__ ++ idct_sse(block, q, voutp, ignore_dc); ++#else ++ idct_scalar(block, q, voutp, ignore_dc); ++#endif + } +--- src/lepton/vp8_encoder.cc ++++ src/lepton/vp8_encoder.cc +@@ -150,29 +150,34 @@ void VP8ComponentEncoder::process_row(Pr + } + } + uint32_t aligned_block_cost(const AlignedBlock &block) { +- uint32_t cost = 16; // .25 cost for zeros +- if (VECTORIZE) { +- for (int i = 0; i < 64; i+= 8) { +- __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i))); +- __m128i v_cost = _mm_set1_epi16(0); +- while (!_mm_test_all_zeros(val, val)) { +- __m128i mask = _mm_cmpgt_epi16(val, _mm_setzero_si128()); +- v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2))); +- val = _mm_srli_epi16(val, 1); +- } +- __m128i sum = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8)); +- sum = _mm_add_epi16(sum ,_mm_srli_si128(sum, 4)); +- sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 2)); +- cost += _mm_extract_epi16(sum, 0); +- } +- } else { +- uint32_t scost = 0; +- for (int i = 0; i < 64; ++i) { +- scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i])); ++#ifdef __SSE2__ /* SSE2 or higher instruction set available { */ ++ const __m128i zero = _mm_setzero_si128(); ++ __m128i v_cost; ++ for (int i = 0; i < 64; i+= 8) { ++ __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i))); ++ v_cost = _mm_set1_epi16(0); ++#ifndef __SSE4_1__ ++ while (_mm_movemask_epi8(_mm_cmpeq_epi32(val, zero)) != 0xFFFF) ++#else ++ while (!_mm_test_all_zeros(val, val)) ++#endif ++ { ++ __m128i mask = _mm_cmpgt_epi16(val, zero); ++ v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2))); ++ val = _mm_srli_epi16(val, 1); + } +- cost = scost; ++ v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8)); ++ v_cost = _mm_add_epi16(v_cost ,_mm_srli_si128(v_cost, 4)); ++ v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 2)); + } +- return cost; ++ return 16 + _mm_extract_epi16(v_cost, 0); ++#else /* } No SSE2 instructions { */ ++ uint32_t scost = 0; ++ for (int i = 0; i < 64; ++i) { ++ scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i])); ++ } ++ return scost; ++#endif /* } */ + } + + #ifdef ALLOW_FOUR_COLORS +--- src/vp8/model/model.hh ++++ src/vp8/model/model.hh +@@ -11,9 +11,7 @@ + #include "branch.hh" + #include "../util/aligned_block.hh" + #include "../util/block_based_image.hh" +-#include <smmintrin.h> +-#include <immintrin.h> +-#include <emmintrin.h> ++#include "../util/mm_mullo_epi32.hh" + + class BoolEncoder; + constexpr bool advanced_dc_prediction = true; +--- src/vp8/model/numeric.hh ++++ src/vp8/model/numeric.hh +@@ -8,8 +8,8 @@ + // for std::min + #include <algorithm> + #include <assert.h> +-#include <smmintrin.h> +-#include <emmintrin.h> ++#include <immintrin.h> ++#include "../util/mm_mullo_epi32.hh" + + #ifdef _WIN32 + #include <intrin.h> +--- src/vp8/util/mm_mullo_epi32.hh ++++ src/vp8/util/mm_mullo_epi32.hh +@@ -0,0 +1,16 @@ ++#if defined(__SSE2__) && !defined(__SSE4_1__) && !defined(MM_MULLO_EPI32_H) ++#define MM_MULLO_EPI32_H ++#include <immintrin.h> ++// See: http://stackoverflow.com/questions/10500766/sse-multiplication-of-4-32-bit-integers ++// and https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768 ++static inline __m128i ++_mm_mullo_epi32(const __m128i &a, const __m128i &b) ++{ ++ __m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/ ++ __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4), ++ _mm_srli_si128(b,4)); /* mul 3,1 */ ++ return _mm_unpacklo_epi32( /* shuffle results to [63..0] and pack */ ++ _mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), ++ _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); ++} ++#endif ++++ src/lepton/recoder.cc +@@ -99,5 +99,5 @@ + + static bool aligned_memchr16ff(const unsigned char *local_huff_data) { +-#if 1 ++#if !defined(__i386__) + __m128i buf = _mm_load_si128((__m128i const*)local_huff_data); + __m128i ff = _mm_set1_epi8(-1); |