summaryrefslogtreecommitdiff
path: root/graphics/lepton/files/patch-cpu
diff options
context:
space:
mode:
Diffstat (limited to 'graphics/lepton/files/patch-cpu')
-rw-r--r--graphics/lepton/files/patch-cpu217
1 files changed, 0 insertions, 217 deletions
diff --git a/graphics/lepton/files/patch-cpu b/graphics/lepton/files/patch-cpu
deleted file mode 100644
index 4ee98d914194..000000000000
--- a/graphics/lepton/files/patch-cpu
+++ /dev/null
@@ -1,217 +0,0 @@
-Make -- or attempt to -- the code work on CPUs with only SSSE3
-instruction set...
-
- -mi
-
---- src/lepton/idct.cc
-+++ src/lepton/idct.cc
-@@ -1,8 +1,7 @@
- /* -*-mode:c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
--#include <emmintrin.h>
--#include <smmintrin.h>
- #include <immintrin.h>
- #include "../vp8/util/aligned_block.hh"
-+#include "../vp8/util/mm_mullo_epi32.hh"
-
- namespace idct_local{
- enum {
-@@ -23,7 +21,10 @@ enum {
- r2 = 181 // 256/sqrt(2)
- };
- }
--void idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) {
-+
-+#ifndef __SSE2__
-+static void
-+idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) {
- int32_t intermed[64];
- using namespace idct_local;
- // Horizontal 1-D IDCT.
-@@ -149,6 +150,8 @@ void idct_scalar(const AlignedBlock &blo
- //outp[i]>>=3;
- }
- }
-+#else /* At least SSE2 is available { */
-+
- template<int which_vec, int offset, int stride> __m128i vget_raster(const AlignedBlock&block) {
- return _mm_set_epi32(block.coefficients_raster(which_vec + 3 * stride + offset),
- block.coefficients_raster(which_vec + 2 * stride + offset),
-@@ -162,8 +165,8 @@ template<int offset, int stride> __m128i
- q[which_vec + offset]));
- }
-
--
--__m128i epi32l_to_epi16(__m128i lowvec) {
-+static __m128i
-+epi32l_to_epi16(__m128i lowvec) {
- return _mm_shuffle_epi8(lowvec, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
- 0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
- }
-@@ -181,9 +184,8 @@ __m128i epi32l_to_epi16(__m128i lowvec)
- }while(0)
-
-
--
--
--void idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
-+static void
-+idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
-
- char vintermed_storage[64 * sizeof(int32_t) + 16];
- // align intermediate storage to 16 bytes
-@@ -202,7 +204,12 @@ void idct_sse(const AlignedBlock &block,
- xv6 = vget_raster<0, 5, 8>(block);
- xv7 = vget_raster<0, 3, 8>(block);
- if (__builtin_expect(ignore_dc, true)) {
-+#ifdef __SSE4_1__
- xv0 = _mm_insert_epi32(xv0, 0, 0);
-+#else
-+// See http://stackoverflow.com/questions/38384520/is-there-a-sse2-equivalent-for-mm-insert-epi32
-+ xv0 = _mm_and_si128(xv0, _mm_set_epi32(-1,-1,-1, 0));
-+#endif
- }
- } else {
- xv0 = vget_raster<32, 0, 8>(block);
-@@ -378,7 +385,8 @@ __m128i m256_to_epi16(__m256i vec) {
-
- }*/
- #if __AVX2__
--void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
-+static void
-+idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
- // align intermediate storage to 16 bytes
- using namespace idct_local;
- // Horizontal 1-D IDCT.
-@@ -589,11 +597,16 @@ void idct_avx(const AlignedBlock &block,
- #endif
- }
- }
--#else
--void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
-- idct_sse(block, q, voutp, ignore_dc);
--}
- #endif
--void idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
-+#endif /* } SSE2 or higher is available */
-+
-+void
-+idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
-+#ifdef __AVX2__
- idct_avx(block, q, voutp, ignore_dc);
-+#elif __SSE2__
-+ idct_sse(block, q, voutp, ignore_dc);
-+#else
-+ idct_scalar(block, q, voutp, ignore_dc);
-+#endif
- }
---- src/lepton/vp8_encoder.cc
-+++ src/lepton/vp8_encoder.cc
-@@ -150,29 +150,34 @@ void VP8ComponentEncoder::process_row(Pr
- }
- }
- uint32_t aligned_block_cost(const AlignedBlock &block) {
-- uint32_t cost = 16; // .25 cost for zeros
-- if (VECTORIZE) {
-- for (int i = 0; i < 64; i+= 8) {
-- __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i)));
-- __m128i v_cost = _mm_set1_epi16(0);
-- while (!_mm_test_all_zeros(val, val)) {
-- __m128i mask = _mm_cmpgt_epi16(val, _mm_setzero_si128());
-- v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2)));
-- val = _mm_srli_epi16(val, 1);
-- }
-- __m128i sum = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8));
-- sum = _mm_add_epi16(sum ,_mm_srli_si128(sum, 4));
-- sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 2));
-- cost += _mm_extract_epi16(sum, 0);
-- }
-- } else {
-- uint32_t scost = 0;
-- for (int i = 0; i < 64; ++i) {
-- scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i]));
-+#ifdef __SSE2__ /* SSE2 or higher instruction set available { */
-+ const __m128i zero = _mm_setzero_si128();
-+ __m128i v_cost;
-+ for (int i = 0; i < 64; i+= 8) {
-+ __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i)));
-+ v_cost = _mm_set1_epi16(0);
-+#ifndef __SSE4_1__
-+ while (_mm_movemask_epi8(_mm_cmpeq_epi32(val, zero)) != 0xFFFF)
-+#else
-+ while (!_mm_test_all_zeros(val, val))
-+#endif
-+ {
-+ __m128i mask = _mm_cmpgt_epi16(val, zero);
-+ v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2)));
-+ val = _mm_srli_epi16(val, 1);
- }
-- cost = scost;
-+ v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8));
-+ v_cost = _mm_add_epi16(v_cost ,_mm_srli_si128(v_cost, 4));
-+ v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 2));
- }
-- return cost;
-+ return 16 + _mm_extract_epi16(v_cost, 0);
-+#else /* } No SSE2 instructions { */
-+ uint32_t scost = 0;
-+ for (int i = 0; i < 64; ++i) {
-+ scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i]));
-+ }
-+ return scost;
-+#endif /* } */
- }
-
- #ifdef ALLOW_FOUR_COLORS
---- src/vp8/model/model.hh
-+++ src/vp8/model/model.hh
-@@ -11,9 +11,7 @@
- #include "branch.hh"
- #include "../util/aligned_block.hh"
- #include "../util/block_based_image.hh"
--#include <smmintrin.h>
--#include <immintrin.h>
--#include <emmintrin.h>
-+#include "../util/mm_mullo_epi32.hh"
-
- class BoolEncoder;
- constexpr bool advanced_dc_prediction = true;
---- src/vp8/model/numeric.hh
-+++ src/vp8/model/numeric.hh
-@@ -8,8 +8,8 @@
- // for std::min
- #include <algorithm>
- #include <assert.h>
--#include <smmintrin.h>
--#include <emmintrin.h>
-+#include <immintrin.h>
-+#include "../util/mm_mullo_epi32.hh"
-
- #ifdef _WIN32
- #include <intrin.h>
---- src/vp8/util/mm_mullo_epi32.hh
-+++ src/vp8/util/mm_mullo_epi32.hh
-@@ -0,0 +1,16 @@
-+#if defined(__SSE2__) && !defined(__SSE4_1__) && !defined(MM_MULLO_EPI32_H)
-+#define MM_MULLO_EPI32_H
-+#include <immintrin.h>
-+// See: http://stackoverflow.com/questions/10500766/sse-multiplication-of-4-32-bit-integers
-+// and https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
-+static inline __m128i
-+_mm_mullo_epi32(const __m128i &a, const __m128i &b)
-+{
-+ __m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/
-+ __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4),
-+ _mm_srli_si128(b,4)); /* mul 3,1 */
-+ return _mm_unpacklo_epi32( /* shuffle results to [63..0] and pack */
-+ _mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)),
-+ _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0)));
-+}
-+#endif
-+++ src/lepton/recoder.cc
-@@ -99,5 +99,5 @@
-
- static bool aligned_memchr16ff(const unsigned char *local_huff_data) {
--#if 1
-+#if !defined(__i386__)
- __m128i buf = _mm_load_si128((__m128i const*)local_huff_data);
- __m128i ff = _mm_set1_epi8(-1);