TUT HEVC Encoder
Loading...
Searching...
No Matches
avx2_common_functions.h
Go to the documentation of this file.
1/*****************************************************************************
2 * This file is part of Kvazaar HEVC encoder.
3 *
4 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without modification,
8 * are permitted provided that the following conditions are met:
9 *
10 * * Redistributions of source code must retain the above copyright notice, this
11 * list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright notice, this
14 * list of conditions and the following disclaimer in the documentation and/or
15 * other materials provided with the distribution.
16 *
17 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
18 * contributors may be used to endorse or promote products derived from
19 * this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 ****************************************************************************/
32
33#ifndef AVX2_COMMON_FUNCTIONS_H
34#define AVX2_COMMON_FUNCTIONS_H
35
36#include <immintrin.h>
37
38// The calling convention used by MSVC on 32-bit builds will essentially
39// disallow functions to have more than 3 XMM/YMM parameters, because it
40// will not provide more than 8-byte param alignment, and only the first
41// three vector params will be carried in SIMD registers. Now the
42// vectorcall convention could probably be problematic in globally visible
43// functions, but likely not in static ones.
44#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
45  #define FIX_W32 __vectorcall
46#else
47  #define FIX_W32
48#endif
49
50// Non-inline functions defined in this header are likely to trigger a
51// warning for each module including this header that does NOT use them,
52// at least on unix-ish platforms (GCC/Clang both on native Unix and MinGW).
53// Tell 'em we actually want to do that, it's not an accident.
54#if defined __GNUC__ || defined __clang__ || defined __MINGW32__ || defined __MINGW64__
55  #define FIX_UNUSED __attribute__((unused))
56#else
57  #define FIX_UNUSED
58#endif
59
// Combined attribute set applied to the non-inline helpers defined in this
// header: __vectorcall where needed (32-bit MSVC) plus unused-warning
// suppression where supported (GCC/Clang/MinGW). Expands to nothing elsewhere.
60#define FIX_NOINLINE FIX_W32 FIX_UNUSED
61
62/*
63 * Reorder coefficients from raster to scan order
64 * Fun fact: Once upon a time, doing this in a loop looked like this:
65 * for (int32_t n = 0; n < width * height; n++) {
66 * coef_reord[n] = coef[scan[n]];
67 * q_coef_reord[n] = q_coef[scan[n]];
68 * }
69 */
71{
72 // For vectorized reordering of coef and q_coef
73 const __m128i low128_shuffle_masks[3] = {
74 _mm_setr_epi8(10,11, 4, 5, 12,13, 0, 1, 6, 7, 14,15, 8, 9, 2, 3),
75 _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),
76 _mm_setr_epi8( 4, 5, 6, 7, 0, 1, 2, 3, 12,13, 14,15, 8, 9, 10,11),
77 };
78
79 const __m128i blend_masks[3] = {
80 _mm_setr_epi16( 0, 0, 0, -1, 0, 0, -1, -1),
81 _mm_setr_epi16( 0, 0, 0, 0, 0, 0, 0, 0),
82 _mm_setr_epi16( 0, 0, -1, -1, 0, 0, -1, -1),
83 };
84
86 _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 6, 7, 10,11, 4, 5, 12,13, 14,15),
87 _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),
88 _mm_setr_epi8( 0, 1, 8, 9, 4, 5, 12,13, 2, 3, 10,11, 6, 7, 14,15),
89 };
90
92 _mm_setr_epi8(12,13, 6, 7, 0, 1, 2, 3, 14,15, 4, 5, 8, 9, 10,11),
93 _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),
94 _mm_setr_epi8( 4, 5, 12,13, 0, 1, 8, 9, 6, 7, 14,15, 2, 3, 10,11),
95 };
96
97 const size_t row_offsets[4] = {
98 scan[subpos] + width * 0,
99 scan[subpos] + width * 1,
100 scan[subpos] + width * 2,
101 scan[subpos] + width * 3,
102 };
103
104 for (int i = 0; i < n_bufs; i++) {
105 const int16_t *__restrict coeff = coeffs[i];
106
107 // NOTE: Upper means "higher in pixel order inside block", which implies
108 // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER),
109 // so upper 128b vector actually becomes the lower part of a 256-bit coeff
110 // vector and lower vector the higher part!
113
116
119
122
123 // Zeroing these is actually unnecessary, but the compiler will whine
124 // about uninitialized values otherwise
127
128 coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + row_offsets[0]));
129 coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + row_offsets[1]));
130
131 coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + row_offsets[2]));
132 coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + row_offsets[3]));
133
136
138
141
144
145 // The Intel Intrinsics Guide talks about _mm256_setr_m128i but my headers
146 // lack such an instruction. What it does is essentially this anyway.
149 1);
150 }
151}
152
153// If ints is completely zero, returns 16 in *first and -1 in *last
155{
156 // Note that nonzero_bytes will always have both bytes set for a set word
157 // even if said word only had one of its bytes set, because we're doing 16
158 // bit wide comparisons. No big deal, just shift results to the right by one
159 // bit to have the results represent indexes of first set words, not bytes.
160 // Another note, it has to use right shift instead of division to preserve
161 // behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1)
163
166 *first = ( (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1;
167 *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
168}
169
184
185#endif
static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
Definition avx2_common_functions.h:154
static int32_t hsum_8x32b(const __m256i v)
Definition avx2_common_functions.h:170
#define FIX_NOINLINE
Definition avx2_common_functions.h:60
static INLINE void scanord_read_vector(const int16_t **__restrict coeffs, const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, int32_t width, __m256i *result_vecs, const int n_bufs)
Definition avx2_common_functions.h:70
#define INLINE
Definition global.h:240
#define MAX_TILES_PER_DIM
Definition global.h:232