diff --git a/graphics/lepton/files/patch-cpu b/graphics/lepton/files/patch-cpu
new file mode 100644
index 000000000000..763c149eb95e
--- /dev/null
+++ b/graphics/lepton/files/patch-cpu
@@ -0,0 +1,217 @@
+Make the code work -- or at least attempt to -- on CPUs without the
+SSE4.1 instruction set, falling back to SSE2 or scalar code paths...
+
+ -mi
+
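+For reference, the SSE2 replacement for _mm_mullo_epi32() introduced
+below (src/vp8/util/mm_mullo_epi32.hh) can be sanity-checked against
+plain scalar multiplication with a standalone snippet along these
+lines -- illustrative only, not part of the patched sources:
+
+	#include <emmintrin.h>
+	#include <stdint.h>
+	#include <stdio.h>
+
+	int main(void) {
+		int32_t a[4] = { 3, -7, 100000, -2147483647 };
+		int32_t b[4] = { 9, 11, 100000, 2 };
+		__m128i va = _mm_loadu_si128((const __m128i *)a);
+		__m128i vb = _mm_loadu_si128((const __m128i *)b);
+		/* the same sequence as in mm_mullo_epi32.hh: */
+		__m128i t1 = _mm_mul_epu32(va, vb);
+		__m128i t2 = _mm_mul_epu32(_mm_srli_si128(va, 4),
+		    _mm_srli_si128(vb, 4));
+		__m128i lo = _mm_unpacklo_epi32(
+		    _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 0, 2, 0)),
+		    _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 0, 2, 0)));
+		int32_t r[4];
+		_mm_storeu_si128((__m128i *)r, lo);
+		for (int i = 0; i < 4; i++) /* low 32 bits must match */
+			printf("%d %s\n", r[i], r[i] ==
+			    (int32_t)((uint32_t)a[i] * (uint32_t)b[i]) ?
+			    "ok" : "BAD");
+		return 0;
+	}
+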
+--- src/lepton/idct.cc
++++ src/lepton/idct.cc
+@@ -1,8 +1,6 @@
+ /* -*-mode:c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+-#include <emmintrin.h>
+-#include <smmintrin.h>
+-#include <immintrin.h>
+ #include "../vp8/util/aligned_block.hh"
++#include "../vp8/util/mm_mullo_epi32.hh"
+
+ namespace idct_local{
+ enum {
+@@ -23,7 +21,10 @@ enum {
+ r2 = 181 // 256/sqrt(2)
+ };
+ }
+-void idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) {
++
++#ifndef __SSE2__
++static void
++idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) {
+ int32_t intermed[64];
+ using namespace idct_local;
+ // Horizontal 1-D IDCT.
+@@ -149,6 +150,8 @@ void idct_scalar(const AlignedBlock &blo
+ //outp[i]>>=3;
+ }
+ }
++#else /* At least SSE2 is available { */
++
+ template<int which_vec, int offset, int stride> __m128i vget_raster(const AlignedBlock&block) {
+ return _mm_set_epi32(block.coefficients_raster(which_vec + 3 * stride + offset),
+ block.coefficients_raster(which_vec + 2 * stride + offset),
+@@ -162,8 +165,8 @@ template<int offset, int stride> __m128i
+ q[which_vec + offset]));
+ }
+
+-
+-__m128i epi32l_to_epi16(__m128i lowvec) {
++static __m128i
++epi32l_to_epi16(__m128i lowvec) {
+ return _mm_shuffle_epi8(lowvec, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+ 0xd, 0xc, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+ }
+@@ -181,9 +184,8 @@ __m128i epi32l_to_epi16(__m128i lowvec)
+ }while(0)
+
+
+-
+-
+-void idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
++static void
++idct_sse(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
+
+ char vintermed_storage[64 * sizeof(int32_t) + 16];
+ // align intermediate storage to 16 bytes
+@@ -202,7 +204,12 @@ void idct_sse(const AlignedBlock &block,
+ xv6 = vget_raster<0, 5, 8>(block);
+ xv7 = vget_raster<0, 3, 8>(block);
+ if (__builtin_expect(ignore_dc, true)) {
++#ifdef __SSE4_1__
+ xv0 = _mm_insert_epi32(xv0, 0, 0);
++#else
++// See http://stackoverflow.com/questions/38384520/is-there-a-sse2-equivalent-for-mm-insert-epi32
++ xv0 = _mm_insert_epi16(_mm_insert_epi16(xv0, 0, 0), 0, 1);
++#endif
+ }
+ } else {
+ xv0 = vget_raster<32, 0, 8>(block);
+@@ -378,7 +385,8 @@ __m128i m256_to_epi16(__m256i vec) {
+
+ }*/
+ #if __AVX2__
+-void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
++static void
++idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
+ // align intermediate storage to 16 bytes
+ using namespace idct_local;
+ // Horizontal 1-D IDCT.
+@@ -589,11 +597,16 @@ void idct_avx(const AlignedBlock &block,
+ #endif
+ }
+ }
+-#else
+-void idct_avx(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
+- idct_sse(block, q, voutp, ignore_dc);
+-}
+ #endif
+-void idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
++#endif /* } SSE2 or higher is available */
++
++void
++idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) {
++#ifdef __AVX2__
+ idct_avx(block, q, voutp, ignore_dc);
++#elif defined(__SSE2__)
++ idct_sse(block, q, voutp, ignore_dc);
++#else
++ idct_scalar(block, q, voutp, ignore_dc);
++#endif
+ }
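+
+The SSE2 branch above replaces _mm_insert_epi32(xv0, 0, 0) with a pair
+of 16-bit inserts: clearing 32-bit element 0 is the same as clearing
+16-bit elements 0 and 1.  A standalone check -- illustrative only:
+
+	#include <emmintrin.h>
+	#include <assert.h>
+	#include <stdint.h>
+
+	int main(void) {
+		__m128i v = _mm_set_epi32(4, 3, 2, 1); /* element 0 is 1 */
+		__m128i z = _mm_insert_epi16(_mm_insert_epi16(v, 0, 0), 0, 1);
+		int32_t out[4];
+		_mm_storeu_si128((__m128i *)out, z);
+		assert(out[0] == 0 && out[1] == 2 &&
+		    out[2] == 3 && out[3] == 4);
+		return 0;
+	}
+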
+--- src/lepton/vp8_encoder.cc
++++ src/lepton/vp8_encoder.cc
+@@ -150,29 +150,34 @@ void VP8ComponentEncoder::process_row(Pr
+ }
+ }
+ uint32_t aligned_block_cost(const AlignedBlock &block) {
+- uint32_t cost = 16; // .25 cost for zeros
+- if (VECTORIZE) {
+- for (int i = 0; i < 64; i+= 8) {
+- __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i)));
+- __m128i v_cost = _mm_set1_epi16(0);
+- while (!_mm_test_all_zeros(val, val)) {
+- __m128i mask = _mm_cmpgt_epi16(val, _mm_setzero_si128());
+- v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2)));
+- val = _mm_srli_epi16(val, 1);
+- }
+- __m128i sum = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8));
+- sum = _mm_add_epi16(sum ,_mm_srli_si128(sum, 4));
+- sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 2));
+- cost += _mm_extract_epi16(sum, 0);
+- }
+- } else {
+- uint32_t scost = 0;
+- for (int i = 0; i < 64; ++i) {
+- scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i]));
++#ifdef __SSE2__ /* SSE2 or higher instruction set available { */
++ const __m128i zero = _mm_setzero_si128();
++ uint32_t cost = 16; // .25 cost for zeros
++ for (int i = 0; i < 64; i+= 8) {
++ __m128i val = _mm_abs_epi16(_mm_load_si128((const __m128i*)(const char*)(block.raw_data() + i)));
++ __m128i v_cost = _mm_set1_epi16(0);
++#ifndef __SSE4_1__
++ while (_mm_movemask_epi8(_mm_cmpeq_epi32(val, zero)) != 0xFFFF)
++#else
++ while (!_mm_test_all_zeros(val, val))
++#endif
++ {
++ __m128i mask = _mm_cmpgt_epi16(val, zero);
++ v_cost = _mm_add_epi16(v_cost, _mm_and_si128(mask, _mm_set1_epi16(2)));
++ val = _mm_srli_epi16(val, 1);
+ }
+- cost = scost;
++ v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 8));
++ v_cost = _mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 4));
++ cost += _mm_extract_epi16(_mm_add_epi16(v_cost, _mm_srli_si128(v_cost, 2)), 0); /* total for this chunk of 8 */
+ }
+ return cost;
++#else /* } No SSE2 instructions { */
++ uint32_t scost = 0;
++ for (int i = 0; i < 64; ++i) {
++ scost += 1 + 2 * uint16bit_length(abs(block.raw_data()[i]));
++ }
++ return scost;
++#endif /* } */
+ }
+
+ #ifdef ALLOW_FOUR_COLORS
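+
+The loop condition in aligned_block_cost() above tests "all elements
+zero" without SSE4.1's PTEST: _mm_cmpeq_epi32 against zero sets every
+zero element to all-ones, so _mm_movemask_epi8 yields 0xFFFF exactly
+when the whole register is zero.  A minimal SSE2-only illustration:
+
+	#include <emmintrin.h>
+	#include <assert.h>
+
+	static int all_zeros_sse2(__m128i v) {
+		return _mm_movemask_epi8(_mm_cmpeq_epi32(v,
+		    _mm_setzero_si128())) == 0xFFFF;
+	}
+
+	int main(void) {
+		assert(all_zeros_sse2(_mm_setzero_si128()));
+		assert(!all_zeros_sse2(_mm_set_epi32(0, 0, 1, 0)));
+		return 0;
+	}
+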
+--- src/vp8/model/model.hh
++++ src/vp8/model/model.hh
+@@ -11,9 +11,7 @@
+ #include "branch.hh"
+ #include "../util/aligned_block.hh"
+ #include "../util/block_based_image.hh"
+-#include <smmintrin.h>
+-#include <immintrin.h>
+-#include <emmintrin.h>
++#include "../util/mm_mullo_epi32.hh"
+
+ class BoolEncoder;
+ constexpr bool advanced_dc_prediction = true;
+--- src/vp8/model/numeric.hh
++++ src/vp8/model/numeric.hh
+@@ -8,8 +8,8 @@
+ // for std::min
+ #include <algorithm>
+ #include <assert.h>
+-#include <smmintrin.h>
+-#include <emmintrin.h>
++#include <immintrin.h>
++#include "../util/mm_mullo_epi32.hh"
+
+ #ifdef _WIN32
+ #include <intrin.h>
+--- src/vp8/util/mm_mullo_epi32.hh
++++ src/vp8/util/mm_mullo_epi32.hh
+@@ -0,0 +1,18 @@
++#ifndef MM_MULLO_EPI32_H
++#define MM_MULLO_EPI32_H
++#include <immintrin.h>
++#if defined(__SSE2__) && !defined(__SSE4_1__)
++// See: http://stackoverflow.com/questions/10500766/sse-multiplication-of-4-32-bit-integers
++// and https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
++static inline __m128i
++_mm_mullo_epi32(const __m128i &a, const __m128i &b)
++{
++ __m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/
++ __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4),
++ _mm_srli_si128(b,4)); /* mul 3,1 */
++ return _mm_unpacklo_epi32( /* shuffle results to [63..0] and pack */
++ _mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)),
++ _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0)));
++}
++#endif /* __SSE2__ && !__SSE4_1__ */
++#endif /* !MM_MULLO_EPI32_H */
+--- src/lepton/recoder.cc
++++ src/lepton/recoder.cc
+@@ -99,5 +99,5 @@
+
+ static bool aligned_memchr16ff(const unsigned char *local_huff_data) {
+-#if 1
++#if !defined(__i386__)
+ __m128i buf = _mm_load_si128((__m128i const*)local_huff_data);
+ __m128i ff = _mm_set1_epi8(-1);