feat: godot-engine-source-4.3-stable

2025-01-17 16:36:38 +01:00 · 2025-01-17 16:36:38 +01:00 · 7125d019b5
commit 7125d019b5
parent c59a7dcade
11149 changed files with 5070401 additions and 0 deletions
--- a/engine/thirdparty/basis_universal/encoder/basisu_backend.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_backend.cpp
--- a/engine/thirdparty/basis_universal/encoder/basisu_backend.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_backend.h
@ -0,0 +1,409 @@
+// basisu_backend.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "../transcoder/basisu.h"
+#include "basisu_enc.h"
+#include "../transcoder/basisu_transcoder_internal.h"
+#include "basisu_frontend.h"
+
+namespace basisu
+{
+	// A 4x4 block of selectors, stored one selector per byte in m_selectors.
+	// Selector (x, y) lives at index x + y * 4. set_uint32()/get_uint32() convert to and from a
+	// packed form that uses 2 bits per selector, with selector 0 in the lowest two bits.
+	struct etc1_selector_palette_entry
+	{
+		etc1_selector_palette_entry()
+		{
+			clear();
+		}
+
+		// Zero-fills the whole entry.
+		void clear()
+		{
+			basisu::clear_obj(*this);
+		}
+
+		// Linear access to selector i, where i must be in [0, 15].
+		uint8_t operator[] (uint32_t i) const { assert(i < 16); return m_selectors[i]; }
+		uint8_t& operator[] (uint32_t i) { assert(i < 16); return m_selectors[i]; }
+
+		// Unpacks sixteen 2-bit selectors from v; selector n comes from bits [2n, 2n+1].
+		void set_uint32(uint32_t v)
+		{
+			for (uint32_t sel_index = 0; sel_index < 16; sel_index++)
+				m_selectors[sel_index] = (v >> (sel_index * 2)) & 3;
+		}
+
+		// Packs the selectors back into a 32-bit value, one get_byte() result per output byte.
+		uint32_t get_uint32() const
+		{
+			uint32_t packed = 0;
+			for (uint32_t byte_index = 0; byte_index < 4; byte_index++)
+				packed |= get_byte(byte_index) << (byte_index * 8);
+			return packed;
+		}
+
+		// Combines the four selectors starting at index byte_index * 4, 2 bits apart.
+		// byte_index must be in [0, 3].
+		uint32_t get_byte(uint32_t byte_index) const
+		{
+			assert(byte_index < 4);
+
+			const uint8_t* pGroup = &m_selectors[byte_index * 4];
+			return pGroup[0] | (pGroup[1] << 2) | (pGroup[2] << 4) | (pGroup[3] << 6);
+		}
+
+		// 2D access; x and y must both be in [0, 3].
+		uint8_t operator()(uint32_t x, uint32_t y) const { assert((x < 4) && (y < 4)); return m_selectors[y * 4 + x]; }
+		uint8_t& operator()(uint32_t x, uint32_t y) { assert((x < 4) && (y < 4)); return m_selectors[y * 4 + x]; }
+
+		// Lexicographic ordering over the 16 selector bytes.
+		bool operator< (const etc1_selector_palette_entry& other) const
+		{
+			// Skip the common prefix, then compare the first differing selector (if any).
+			uint32_t i = 0;
+			while ((i < 16) && (m_selectors[i] == other.m_selectors[i]))
+				i++;
+
+			return (i < 16) && (m_selectors[i] < other.m_selectors[i]);
+		}
+
+		// True when all 16 selectors match.
+		bool operator== (const etc1_selector_palette_entry& other) const
+		{
+			uint32_t i = 0;
+			while ((i < 16) && (m_selectors[i] == other.m_selectors[i]))
+				i++;
+
+			return i == 16;
+		}
+
+	private:
+		uint8_t m_selectors[16];
+	};
+
+	typedef basisu::vector<etc1_selector_palette_entry> etc1_selector_palette_entry_vec;
+
+	// Per-block record kept by basisu_backend (see m_slice_encoder_blocks).
+	// A freshly constructed or cleared block has every index set to 0 and m_is_cr_target false.
+	struct encoder_block
+	{
+		uint32_t m_endpoint_predictor;
+
+		int m_endpoint_index;
+		int m_selector_index;
+
+		int m_selector_history_buf_index;
+
+		bool m_is_cr_target;
+
+		encoder_block()
+		{
+			clear();
+		}
+
+		// Resets every field to its default (0 / false).
+		void clear()
+		{
+			m_is_cr_target = false;
+			m_selector_history_buf_index = 0;
+			m_selector_index = 0;
+			m_endpoint_index = 0;
+			m_endpoint_predictor = 0;
+		}
+	};
+
+	typedef basisu::vector<encoder_block> encoder_block_vec;
+	typedef vector2D<encoder_block> encoder_block_vec2D;
+
+	// One endpoint palette entry: a color (m_color5), an intensity value (m_inten5) and a flag
+	// saying whether m_color5 holds a valid value. Zero-filled on construction and by clear().
+	struct etc1_endpoint_palette_entry
+	{
+		color_rgba m_color5;
+		uint32_t m_inten5;
+		bool m_color5_valid;
+
+		etc1_endpoint_palette_entry()
+		{
+			clear();
+		}
+
+		// Zero-fills the whole entry.
+		void clear()
+		{
+			basisu::clear_obj(*this);
+		}
+	};
+
+	typedef basisu::vector<etc1_endpoint_palette_entry> etc1_endpoint_palette_entry_vec;
+
+	// Settings handed to basisu_backend::init().
+	// Defaults after construction/clear(): every flag false except m_validate (true), both RDO
+	// thresholds 0.0f, and compression level 0.
+	struct basisu_backend_params
+	{
+		bool m_etc1s;
+		bool m_debug, m_debug_images;
+		float m_endpoint_rdo_quality_thresh;
+		float m_selector_rdo_quality_thresh;
+		uint32_t m_compression_level;
+
+		bool m_used_global_codebooks;
+
+		bool m_validate;
+
+		basisu_backend_params()
+		{
+			clear();
+		}
+
+		// Restores the defaults listed above.
+		void clear()
+		{
+			// Flags
+			m_etc1s = false;
+			m_debug = false;
+			m_debug_images = false;
+			m_used_global_codebooks = false;
+			m_validate = true;
+
+			// Numeric settings
+			m_compression_level = 0;
+			m_endpoint_rdo_quality_thresh = 0.0f;
+			m_selector_rdo_quality_thresh = 0.0f;
+		}
+	};
+
+	// Describes one slice as seen by the backend. Zero-filled on construction and by clear().
+	struct basisu_backend_slice_desc
+	{
+		// Index of this slice's first block; blocks of a slice are numbered row by row from here
+		// (see basisu_backend::get_block_index()).
+		uint32_t m_first_block_index;
+
+		// Dimensions of the original image; basisu_backend::get_total_input_texels() multiplies them.
+		uint32_t m_orig_width;
+		uint32_t m_orig_height;
+
+		uint32_t m_width;
+		uint32_t m_height;
+
+		// Slice size in blocks.
+		uint32_t m_num_blocks_x;
+		uint32_t m_num_blocks_y;
+
+		// Slice size in macroblocks.
+		uint32_t m_num_macroblocks_x;
+		uint32_t m_num_macroblocks_y;
+
+		uint32_t m_source_file_index;		// also the basis image index
+		uint32_t m_mip_index;
+		bool m_alpha;
+		bool m_iframe;
+
+		basisu_backend_slice_desc()
+		{
+			clear();
+		}
+
+		// Zero-fills the whole descriptor.
+		void clear()
+		{
+			basisu::clear_obj(*this);
+		}
+	};
+
+	typedef basisu::vector<basisu_backend_slice_desc> basisu_backend_slice_desc_vec;
+
+	// Result container returned by basisu_backend::get_output() and consumed by basisu_file::init():
+	// format flags, the two palettes, the slice descriptions and the per-slice data/CRCs.
+	struct basisu_backend_output
+	{
+		basist::basis_tex_format m_tex_format;
+
+		bool m_etc1s;
+		bool m_uses_global_codebooks;
+		bool m_srgb;
+
+		uint32_t m_num_endpoints;
+		uint32_t m_num_selectors;
+
+		uint8_vec m_endpoint_palette;
+		uint8_vec m_selector_palette;
+
+		basisu_backend_slice_desc_vec m_slice_desc;
+
+		uint8_vec m_slice_image_tables;
+		basisu::vector<uint8_vec> m_slice_image_data;
+		uint16_vec m_slice_image_crcs;
+
+		basisu_backend_output()
+		{
+			clear();
+		}
+
+		// Empties every buffer and restores the scalar defaults
+		// (format cETC1S, m_etc1s false, no global codebooks, sRGB true, zero counts).
+		void clear()
+		{
+			m_slice_image_crcs.clear();
+			m_slice_image_data.clear();
+			m_slice_image_tables.clear();
+			m_slice_desc.clear();
+			m_selector_palette.clear();
+			m_endpoint_palette.clear();
+
+			m_num_selectors = 0;
+			m_num_endpoints = 0;
+
+			m_srgb = true;
+			m_uses_global_codebooks = false;
+			m_etc1s = false;
+			m_tex_format = basist::basis_tex_format::cETC1S;
+		}
+
+		// Sum of the byte sizes of the tables, both palettes and every slice's image data.
+		// The sum is accumulated in 32 bits.
+		uint32_t get_output_size_estimate() const
+		{
+			uint32_t estimate = 0;
+
+			uint32_t slice_index = 0;
+			while (slice_index < m_slice_image_data.size())
+			{
+				estimate += (uint32_t)m_slice_image_data[slice_index].size();
+				slice_index++;
+			}
+
+			estimate += (uint32_t)(m_slice_image_tables.size() + m_endpoint_palette.size() + m_selector_palette.size());
+
+			return estimate;
+		}
+	};
+
+	// Encoder backend. init() receives the frontend, the parameters and the slice descriptions;
+	// get_output() exposes m_output and get_params() exposes m_params.
+	// The non-inline members are defined outside this header.
+	class basisu_backend
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(basisu_backend);
+
+	public:
+
+		basisu_backend();
+
+		void clear();
+
+		void init(basisu_frontend *pFront_end, basisu_backend_params &params, const basisu_backend_slice_desc_vec &slice_desc);
+
+		uint32_t encode();
+
+		const basisu_backend_output &get_output() const { return m_output; }
+		const basisu_backend_params& get_params() const { return m_params; }
+
+	private:
+		basisu_frontend *m_pFront_end;
+		basisu_backend_params m_params;
+		basisu_backend_slice_desc_vec m_slices;
+		basisu_backend_output m_output;
+
+		etc1_endpoint_palette_entry_vec m_endpoint_palette;
+		etc1_selector_palette_entry_vec m_selector_palette;
+
+		struct etc1_global_selector_cb_entry_desc
+		{
+			uint32_t m_pal_index;
+			uint32_t m_mod_index;
+			bool m_was_used;
+		};
+
+		typedef basisu::vector<etc1_global_selector_cb_entry_desc> etc1_global_selector_cb_entry_desc_vec;
+
+		etc1_global_selector_cb_entry_desc_vec m_global_selector_palette_desc;
+
+		basisu::vector<encoder_block_vec2D> m_slice_encoder_blocks;
+
+		// Endpoint remap tables (OLD to NEW and NEW to OLD indices) and per-index usage flags
+		uint_vec m_endpoint_remap_table_old_to_new;
+		uint_vec m_endpoint_remap_table_new_to_old;
+		bool_vec m_old_endpoint_was_used;
+		bool_vec m_new_endpoint_was_used;
+
+		// Selector remap table, OLD to NEW indices
+		uint_vec m_selector_remap_table_old_to_new;
+
+		// Selector remap table, NEW to OLD indices
+		uint_vec m_selector_remap_table_new_to_old;
+
+		// Number of slice descriptions held in m_slices.
+		uint32_t get_total_slices() const
+		{
+			return (uint32_t)m_slices.size();
+		}
+
+		// Forwards to the frontend's get_total_output_blocks(); m_pFront_end must be set.
+		uint32_t get_total_slice_blocks() const
+		{
+			return m_pFront_end->get_total_output_blocks();
+		}
+
+		// Global index of block (block_x, block_y) inside slice slice_index.
+		// Blocks are numbered row by row starting at the slice's m_first_block_index.
+		uint32_t get_block_index(uint32_t slice_index, uint32_t block_x, uint32_t block_y) const
+		{
+			const basisu_backend_slice_desc &desc = m_slices[slice_index];
+
+			assert((block_x < desc.m_num_blocks_x) && (block_y < desc.m_num_blocks_y));
+
+			const uint32_t row_start = desc.m_first_block_index + block_y * desc.m_num_blocks_x;
+			return row_start + block_x;
+		}
+
+		// Number of blocks in one slice.
+		uint32_t get_total_blocks(uint32_t slice_index) const
+		{
+			const basisu_backend_slice_desc &desc = m_slices[slice_index];
+			return desc.m_num_blocks_x * desc.m_num_blocks_y;
+		}
+
+		// Number of blocks summed over all slices.
+		uint32_t get_total_blocks() const
+		{
+			uint32_t block_sum = 0;
+			for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+				block_sum += get_total_blocks(slice_index);
+			return block_sum;
+		}
+
+		// Returns the total number of input texels, not counting padding up to blocks/macroblocks.
+		uint32_t get_total_input_texels(uint32_t slice_index) const
+		{
+			const basisu_backend_slice_desc &desc = m_slices[slice_index];
+			return desc.m_orig_width * desc.m_orig_height;
+		}
+
+		// Input texel count summed over all slices.
+		uint32_t get_total_input_texels() const
+		{
+			uint32_t texel_sum = 0;
+			for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+				texel_sum += get_total_input_texels(slice_index);
+			return texel_sum;
+		}
+
+		// Finds the slice whose block range contains block_index.
+		// Returns the slice index, or -1 when no slice contains it. On success the block's
+		// coordinates inside the slice are written to *pBlock_x / *pBlock_y (either may be null).
+		int find_slice(uint32_t block_index, uint32_t *pBlock_x, uint32_t *pBlock_y) const
+		{
+			for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+			{
+				const uint32_t first_block = m_slices[slice_index].m_first_block_index;
+				const uint32_t blocks_per_row = m_slices[slice_index].m_num_blocks_x;
+				const uint32_t end_block = first_block + blocks_per_row * m_slices[slice_index].m_num_blocks_y;
+
+				if ((block_index < first_block) || (block_index >= end_block))
+					continue;
+
+				const uint32_t ofs_in_slice = block_index - first_block;
+
+				if (pBlock_x)
+					*pBlock_x = ofs_in_slice % blocks_per_row;
+				if (pBlock_y)
+					*pBlock_y = ofs_in_slice / blocks_per_row;
+
+				return static_cast<int>(slice_index);
+			}
+
+			return -1;
+		}
+
+		void create_endpoint_palette();
+
+		void create_selector_palette();
+
+		// endpoint palette
+		//   5:5:5 and predicted 4:4:4 colors, 1 or 2 3-bit intensity table indices
+		// selector palette
+		//   4x4 2-bit selectors
+
+		// per-macroblock:
+		//  4 diff bits
+		//  4 flip bits
+		//  Endpoint template index, 1-8 endpoint indices
+		//      Alternately, if no template applies, we can send 4 ETC1S bits followed by 4-8 endpoint indices
+		//  4 selector indices
+
+		void reoptimize_and_sort_endpoints_codebook(uint32_t total_block_endpoints_remapped, uint_vec &all_endpoint_indices);
+		void sort_selector_codebook();
+		void create_encoder_blocks();
+		void compute_slice_crcs();
+		bool encode_image();
+		bool encode_endpoint_palette();
+		bool encode_selector_palette();
+		int find_video_frame(int slice_index, int delta);
+		void check_for_valid_cr_blocks();
+	};
+
+} // namespace basisu
+
--- a/engine/thirdparty/basis_universal/encoder/basisu_basis_file.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_basis_file.cpp
@ -0,0 +1,269 @@
+// basisu_basis_file.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_basis_file.h"
+#include "../transcoder/basisu_transcoder.h"
+
+// The output file version. Keep in sync with BASISD_SUPPORTED_BASIS_VERSION.
+#define BASIS_FILE_VERSION (0x13)
+
+namespace basisu
+{
+	// Fills m_header from the encoder output, the caller-supplied metadata and the offsets/sizes
+	// stored in the m_*_file_ofs / m_total_file_size members (init() sets those before calling this).
+	void basisu_file::create_header(const basisu_backend_output &encoder_output, basist::basis_texture_type tex_type, uint32_t userdata0, uint32_t userdata1, bool y_flipped, uint32_t us_per_frame)
+	{
+		const basisu_backend_slice_desc_vec &slice_descs = encoder_output.m_slice_desc;
+
+		m_header.m_header_size = sizeof(basist::basis_file_header);
+
+		m_header.m_data_size = m_total_file_size - sizeof(basist::basis_file_header);
+
+		m_header.m_total_slices = (uint32_t)slice_descs.size();
+
+		// Image count is the highest source file index referenced by any slice, plus one.
+		m_header.m_total_images = 0;
+		for (uint32_t slice_index = 0; slice_index < slice_descs.size(); slice_index++)
+			m_header.m_total_images = maximum<uint32_t>(m_header.m_total_images, slice_descs[slice_index].m_source_file_index + 1);
+
+		m_header.m_tex_format = (int)encoder_output.m_tex_format;
+
+		// Collect the header flag bits locally and store them once.
+		uint32_t header_flags = 0;
+
+		if (encoder_output.m_etc1s)
+		{
+			assert(encoder_output.m_tex_format == basist::basis_tex_format::cETC1S);
+			header_flags |= basist::cBASISHeaderFlagETC1S;
+		}
+		else
+		{
+			assert(encoder_output.m_tex_format != basist::basis_tex_format::cETC1S);
+		}
+
+		if (y_flipped)
+			header_flags |= basist::cBASISHeaderFlagYFlipped;
+		if (encoder_output.m_uses_global_codebooks)
+			header_flags |= basist::cBASISHeaderFlagUsesGlobalCodebook;
+		if (encoder_output.m_srgb)
+			header_flags |= basist::cBASISHeaderFlagSRGB;
+
+		// One alpha slice is enough to set the flag, so stop at the first one found.
+		bool found_alpha_slice = false;
+		for (uint32_t slice_index = 0; (slice_index < slice_descs.size()) && !found_alpha_slice; slice_index++)
+			found_alpha_slice = slice_descs[slice_index].m_alpha;
+		if (found_alpha_slice)
+			header_flags |= basist::cBASISHeaderFlagHasAlphaSlices;
+
+		m_header.m_flags = header_flags;
+
+		m_header.m_tex_type = static_cast<uint8_t>(tex_type);
+		m_header.m_us_per_frame = clamp<uint32_t>(us_per_frame, 0, basist::cBASISMaxUSPerFrame);
+
+		m_header.m_userdata0 = userdata0;
+		m_header.m_userdata1 = userdata1;
+
+		m_header.m_total_endpoints = encoder_output.m_num_endpoints;
+		m_header.m_total_selectors = encoder_output.m_num_selectors;
+
+		// Codebook offsets/sizes are only written when the file carries its own codebooks.
+		if (encoder_output.m_uses_global_codebooks)
+		{
+			assert(!m_endpoint_cb_file_ofs);
+			assert(!m_selector_cb_file_ofs);
+		}
+		else
+		{
+			m_header.m_endpoint_cb_file_ofs = m_endpoint_cb_file_ofs;
+			m_header.m_endpoint_cb_file_size = (uint32_t)encoder_output.m_endpoint_palette.size();
+
+			m_header.m_selector_cb_file_ofs = m_selector_cb_file_ofs;
+			m_header.m_selector_cb_file_size = (uint32_t)encoder_output.m_selector_palette.size();
+		}
+
+		m_header.m_tables_file_ofs = m_tables_file_ofs;
+		m_header.m_tables_file_size = (uint32_t)encoder_output.m_slice_image_tables.size();
+
+		m_header.m_slice_desc_file_ofs = m_slice_descs_file_ofs;
+	}
+
+	// Builds one file slice descriptor per backend slice in m_images_descs, assigning consecutive
+	// file offsets starting at m_first_image_file_ofs.
+	// Returns false (after printing an error) if a slice or the running offset exceeds UINT32_MAX.
+	bool basisu_file::create_image_descs(const basisu_backend_output &encoder_output)
+	{
+		const basisu_backend_slice_desc_vec &slice_descs = encoder_output.m_slice_desc;
+
+		m_images_descs.resize(slice_descs.size());
+
+		// Tracked in 64 bits so that running past the 32-bit limit can be detected.
+		uint64_t next_file_ofs = m_first_image_file_ofs;
+
+		for (uint32_t slice_index = 0; slice_index < slice_descs.size(); slice_index++)
+		{
+			const basisu_backend_slice_desc &src = slice_descs[slice_index];
+			basist::basis_slice_desc &dst = m_images_descs[slice_index];
+
+			clear_obj(dst);
+
+			dst.m_image_index = src.m_source_file_index;
+			dst.m_level_index = src.m_mip_index;
+
+			if (src.m_alpha)
+				dst.m_flags = dst.m_flags | basist::cSliceDescFlagsHasAlpha;
+			if (src.m_iframe)
+				dst.m_flags = dst.m_flags | basist::cSliceDescFlagsFrameIsIFrame;
+
+			dst.m_orig_width = src.m_orig_width;
+			dst.m_orig_height = src.m_orig_height;
+			dst.m_num_blocks_x = src.m_num_blocks_x;
+			dst.m_num_blocks_y = src.m_num_blocks_y;
+			dst.m_slice_data_crc16 = encoder_output.m_slice_image_crcs[slice_index];
+
+			const uint8_vec &slice_data = encoder_output.m_slice_image_data[slice_index];
+			if (slice_data.size() > UINT32_MAX)
+			{
+				error_printf("basisu_file::create_image_descs: Basis file too large\n");
+				return false;
+			}
+
+			const uint32_t slice_bytes = (uint32_t)slice_data.size();
+
+			dst.m_file_ofs = (uint32_t)next_file_ofs;
+			dst.m_file_size = slice_bytes;
+
+			next_file_ofs += slice_bytes;
+			if (next_file_ofs > UINT32_MAX)
+			{
+				error_printf("basisu_file::create_image_descs: Basis file too large\n");
+				return false;
+			}
+		}
+
+		assert(next_file_ofs == m_total_file_size);
+		return true;
+	}
+
+	// Serializes the file into m_comp_data in this order: header, slice descriptors, endpoint and
+	// selector palettes (only when global codebooks are not used), tables, then each slice's data.
+	// The asserts check that every section lands at the offset init() computed for it.
+	//
+	// Every append whose source is a vector is guarded by a size check, so element 0 of an empty
+	// vector is never indexed (zero slices, or a slice with no data).
+	void basisu_file::create_comp_data(const basisu_backend_output &encoder_output)
+	{
+		const basisu_backend_slice_desc_vec &slice_descs = encoder_output.m_slice_desc;
+
+		append_vector(m_comp_data, reinterpret_cast<const uint8_t *>(&m_header), sizeof(m_header));
+
+		assert(m_comp_data.size() == m_slice_descs_file_ofs);
+		if (m_images_descs.size())
+		{
+			append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&m_images_descs[0]), m_images_descs.size() * sizeof(m_images_descs[0]));
+		}
+
+		if (!encoder_output.m_uses_global_codebooks)
+		{
+			if (encoder_output.m_endpoint_palette.size())
+			{
+				assert(m_comp_data.size() == m_endpoint_cb_file_ofs);
+				append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_endpoint_palette[0]), encoder_output.m_endpoint_palette.size());
+			}
+
+			if (encoder_output.m_selector_palette.size())
+			{
+				assert(m_comp_data.size() == m_selector_cb_file_ofs);
+				append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_selector_palette[0]), encoder_output.m_selector_palette.size());
+			}
+		}
+
+		if (encoder_output.m_slice_image_tables.size())
+		{
+			assert(m_comp_data.size() == m_tables_file_ofs);
+			append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_slice_image_tables[0]), encoder_output.m_slice_image_tables.size());
+		}
+
+		assert(m_comp_data.size() == m_first_image_file_ofs);
+		for (uint32_t i = 0; i < slice_descs.size(); i++)
+		{
+			const uint8_vec &slice_data = encoder_output.m_slice_image_data[i];
+
+			// An empty slice contributes no bytes; skip it rather than taking &slice_data[0].
+			if (slice_data.size())
+				append_vector(m_comp_data, &slice_data[0], slice_data.size());
+		}
+
+		assert(m_comp_data.size() == m_total_file_size);
+	}
+
+	// Patches the header that sits at the start of m_comp_data: data size, CRC-16 of everything
+	// after the header, CRC-16 of the header from m_data_size to its end, then signature and version.
+	// m_comp_data must already contain the complete file.
+	void basisu_file::fixup_crcs()
+	{
+		uint8_t *pFile_bytes = &m_comp_data[0];
+		basist::basis_file_header *pHeader = reinterpret_cast<basist::basis_file_header *>(pFile_bytes);
+
+		const size_t header_bytes = sizeof(basist::basis_file_header);
+		const size_t data_bytes = m_total_file_size - header_bytes;
+
+		pHeader->m_data_size = data_bytes;
+		pHeader->m_data_crc16 = basist::crc16(pFile_bytes + header_bytes, data_bytes, 0);
+
+		// Computed after the two fields above have been written, and before sig/ver are set.
+		const size_t crc_start_ofs = BASISU_OFFSETOF(basist::basis_file_header, m_data_size);
+		pHeader->m_header_crc16 = basist::crc16(&pHeader->m_data_size, header_bytes - crc_start_ofs, 0);
+
+		pHeader->m_sig = basist::basis_file_header::cBASISSigValue;
+		pHeader->m_ver = BASIS_FILE_VERSION;// basist::basis_file_header::cBASISFirstVersion;
+	}
+
+	// Builds the complete file image in m_comp_data from the backend output.
+	// Steps: reset state, reject outputs too large for the 32-bit file fields, lay out the section
+	// offsets, then create the header, the slice descriptors, the serialized data and the CRCs.
+	// Returns false if the size checks or create_image_descs() fail.
+	bool basisu_file::init(const basisu_backend_output &encoder_output, basist::basis_texture_type tex_type, uint32_t userdata0, uint32_t userdata1, bool y_flipped, uint32_t us_per_frame)
+	{
+		clear();
+
+		const basisu_backend_slice_desc_vec &slice_descs = encoder_output.m_slice_desc;
+		const bool uses_global_codebooks = encoder_output.m_uses_global_codebooks;
+
+		// The Basis file uses 32-bit fields for lots of stuff, so make sure it's not too large.
+		uint64_t check_size = (uint64_t)sizeof(basist::basis_file_header) + (uint64_t)sizeof(basist::basis_slice_desc) * slice_descs.size() +
+			(uint64_t)encoder_output.m_slice_image_tables.size();
+		if (!uses_global_codebooks)
+		{
+			// The palettes are only counted when they are stored in the file itself.
+			check_size += (uint64_t)encoder_output.m_endpoint_palette.size() + (uint64_t)encoder_output.m_selector_palette.size();
+		}
+		if (check_size >= 0xFFFF0000ULL)
+		{
+			error_printf("basisu_file::init: File is too large!\n");
+			return false;
+		}
+
+		m_header_file_ofs = 0;
+		m_slice_descs_file_ofs = sizeof(basist::basis_file_header);
+
+		// Offset of the first byte after the slice descriptor array.
+		const uint32_t slice_descs_end_ofs = (uint32_t)(m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size());
+
+		// Sections that are absent keep offset 0.
+		m_endpoint_cb_file_ofs = 0;
+		m_selector_cb_file_ofs = 0;
+		m_tables_file_ofs = 0;
+
+		if (encoder_output.m_tex_format != basist::basis_tex_format::cETC1S)
+		{
+			// No codebooks or tables: image data directly follows the slice descriptors.
+			m_first_image_file_ofs = slice_descs_end_ofs;
+		}
+		else
+		{
+			if (uses_global_codebooks)
+			{
+				m_tables_file_ofs = slice_descs_end_ofs;
+			}
+			else
+			{
+				m_endpoint_cb_file_ofs = slice_descs_end_ofs;
+				m_selector_cb_file_ofs = m_endpoint_cb_file_ofs + (uint32_t)encoder_output.m_endpoint_palette.size();
+				m_tables_file_ofs = m_selector_cb_file_ofs + (uint32_t)encoder_output.m_selector_palette.size();
+			}
+
+			m_first_image_file_ofs = m_tables_file_ofs + (uint32_t)encoder_output.m_slice_image_tables.size();
+		}
+
+		uint64_t total_file_size = m_first_image_file_ofs;
+		for (uint32_t slice_index = 0; slice_index < encoder_output.m_slice_image_data.size(); slice_index++)
+			total_file_size += encoder_output.m_slice_image_data[slice_index].size();
+		if (total_file_size >= 0xFFFF0000ULL)
+		{
+			error_printf("basisu_file::init: File is too large!\n");
+			return false;
+		}
+
+		m_total_file_size = (uint32_t)total_file_size;
+
+		create_header(encoder_output, tex_type, userdata0, userdata1, y_flipped, us_per_frame);
+
+		if (!create_image_descs(encoder_output))
+			return false;
+
+		create_comp_data(encoder_output);
+
+		fixup_crcs();
+
+		return true;
+	}
+
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_basis_file.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_basis_file.h
@ -0,0 +1,70 @@
+// basisu_basis_file.h
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "../transcoder/basisu_file_headers.h"
+#include "basisu_backend.h"
+
+namespace basisu
+{
+	// Builds the in-memory image of an output file from a basisu_backend_output.
+	// Call init(); on success the bytes are available through get_compressed_data().
+	class basisu_file
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(basisu_file);
+
+	public:
+		// Starts from a fully cleared state so that no member is left indeterminate before
+		// init() runs.
+		basisu_file()
+		{
+			clear();
+		}
+
+		// Empties the output buffer and slice descriptors, zero-fills the header and resets
+		// every section offset and the total size to 0.
+		void clear()
+		{
+			m_comp_data.clear();
+
+			clear_obj(m_header);
+			m_images_descs.clear();
+
+			m_header_file_ofs = 0;
+			m_slice_descs_file_ofs = 0;
+			m_endpoint_cb_file_ofs = 0;
+			m_selector_cb_file_ofs = 0;
+			m_tables_file_ofs = 0;
+			m_first_image_file_ofs = 0;
+			m_total_file_size = 0;
+		}
+
+		// Rebuilds the file image from encoder_output and the given metadata.
+		// Returns false on failure.
+		bool init(const basisu_backend_output& encoder_output, basist::basis_texture_type tex_type, uint32_t userdata0, uint32_t userdata1, bool y_flipped, uint32_t us_per_frame);
+
+		// The serialized file bytes.
+		const uint8_vec &get_compressed_data() const { return m_comp_data; }
+
+	private:
+		basist::basis_file_header m_header;
+		basisu::vector<basist::basis_slice_desc> m_images_descs;
+
+		uint8_vec m_comp_data;
+
+		// Byte offsets of each file section; 0 for sections that are not present.
+		uint32_t m_header_file_ofs;
+		uint32_t m_slice_descs_file_ofs;
+		uint32_t m_endpoint_cb_file_ofs;
+		uint32_t m_selector_cb_file_ofs;
+		uint32_t m_tables_file_ofs;
+		uint32_t m_first_image_file_ofs;
+		uint32_t m_total_file_size;
+
+		void create_header(const basisu_backend_output& encoder_output,  basist::basis_texture_type tex_type, uint32_t userdata0, uint32_t userdata1, bool y_flipped, uint32_t us_per_frame);
+		bool create_image_descs(const basisu_backend_output& encoder_output);
+		void create_comp_data(const basisu_backend_output& encoder_output);
+		void fixup_crcs();
+	};
+
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp
--- a/engine/thirdparty/basis_universal/encoder/basisu_bc7enc.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_bc7enc.h
@ -0,0 +1,132 @@
+// File: basisu_bc7enc.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_enc.h"
+#include "../transcoder/basisu_transcoder_uastc.h"
+
+namespace basisu
+{
+
+#define BC7ENC_MAX_PARTITIONS1 (64)
+#define BC7ENC_MAX_UBER_LEVEL (4)
+
+	typedef uint8_t bc7enc_bool;
+
+#define BC7ENC_TRUE (1)
+#define BC7ENC_FALSE (0)
+		
+	typedef struct { float m_c[4]; } bc7enc_vec4F;
+
+	extern const float g_bc7_weights1x[2 * 4];
+	extern const float g_bc7_weights2x[4 * 4];
+	extern const float g_bc7_weights3x[8 * 4];
+	extern const float g_bc7_weights4x[16 * 4];
+	extern const float g_astc_weights4x[16 * 4];
+	extern const float g_astc_weights5x[32 * 4];
+	extern const float g_astc_weights_3levelsx[3 * 4];
+			
+	extern basist::astc_quant_bin g_astc_sorted_order_unquant[basist::BC7ENC_TOTAL_ASTC_RANGES][256]; // [sorted unquantized order]
+	
+	// Input parameters for color_cell_compression() (passed as pParams).
+	// NOTE(review): the per-field notes below are inferred from the field names only; confirm
+	// against basisu_bc7enc.cpp.
+	struct color_cell_compressor_params
+	{
+		// Pixels to compress; presumably m_pPixels points at m_num_pixels entries.
+		uint32_t m_num_pixels;
+		const basist::color_quad_u8* m_pPixels;
+
+		// Selector weight table; presumably m_pSelector_weights has m_num_selector_weights entries.
+		uint32_t m_num_selector_weights;
+		const uint32_t* m_pSelector_weights;
+
+		const bc7enc_vec4F* m_pSelector_weightsx;
+		uint32_t m_comp_bits;
+
+		const uint8_t *m_pForce_selectors;
+
+		// Non-zero m_astc_endpoint_range enables ASTC mode. m_comp_bits and m_has_pbits are always false. We only support 2, 3, or 4 bit weight encodings.
+		uint32_t m_astc_endpoint_range;
+
+		// Per-channel error weights (four entries).
+		uint32_t m_weights[4];
+		bc7enc_bool m_has_alpha;
+		bc7enc_bool m_has_pbits;
+		bc7enc_bool m_endpoints_share_pbit;
+		bc7enc_bool m_perceptual;
+	};
+
+	// Result structure for color_cell_compression() (passed as pResults).
+	// NOTE(review): m_pSelectors and m_pSelectors_temp are raw pointers; presumably the caller
+	// supplies both buffers — confirm against basisu_bc7enc.cpp.
+	struct color_cell_compressor_results
+	{
+		uint64_t m_best_overall_err;
+		basist::color_quad_u8 m_low_endpoint;
+		basist::color_quad_u8 m_high_endpoint;
+		uint32_t m_pbits[2];
+		uint8_t* m_pSelectors;
+		uint8_t* m_pSelectors_temp;
+
+		// Encoded ASTC indices, if ASTC mode is enabled
+		basist::color_quad_u8 m_astc_low_endpoint;
+		basist::color_quad_u8 m_astc_high_endpoint;
+	};
+
+	// Compression settings passed to color_cell_compression() (as pComp_params).
+	// bc7enc_compress_block_params_init() fills in defaults for every field.
+	struct bc7enc_compress_block_params
+	{
+		// m_max_partitions_mode1 may range from 0 (disables mode 1) to BC7ENC_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality.
+		uint32_t m_max_partitions_mode1;
+
+		// Relative RGBA or YCbCrA weights.
+		uint32_t m_weights[4];
+
+		// m_uber_level may range from 0 to BC7ENC_MAX_UBER_LEVEL. The higher this value, the slower the compressor, but the higher the quality.
+		uint32_t m_uber_level;
+
+		// If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB.
+		bc7enc_bool m_perceptual;
+
+		// bc7enc_compress_block_params_init() sets this to 1.
+		uint32_t m_least_squares_passes;
+	};
+
+	uint64_t color_cell_compression(uint32_t mode, const color_cell_compressor_params* pParams, color_cell_compressor_results* pResults, const bc7enc_compress_block_params* pComp_params);
+		
+	uint64_t color_cell_compression_est_astc(
+		uint32_t num_weights, uint32_t num_comps, const uint32_t* pWeight_table,
+		uint32_t num_pixels, const basist::color_quad_u8* pPixels,
+		uint64_t best_err_so_far, const uint32_t weights[4]);
+		
+	// Turns perceptual mode off and gives all four channels the same weight (1).
+	inline void bc7enc_compress_block_params_init_linear_weights(bc7enc_compress_block_params* p)
+	{
+		for (uint32_t chan = 0; chan < 4; chan++)
+			p->m_weights[chan] = 1;
+
+		p->m_perceptual = BC7ENC_FALSE;
+	}
+
+	// Turns perceptual mode on and sets the channel weights to 128, 64, 16, 32.
+	inline void bc7enc_compress_block_params_init_perceptual_weights(bc7enc_compress_block_params* p)
+	{
+		const uint32_t perceptual_weights[4] = { 128, 64, 16, 32 };
+
+		for (uint32_t chan = 0; chan < 4; chan++)
+			p->m_weights[chan] = perceptual_weights[chan];
+
+		p->m_perceptual = BC7ENC_TRUE;
+	}
+
+	// Default setup: perceptual weights, uber level 0, one least squares pass, and the maximum
+	// number of mode 1 partitions.
+	inline void bc7enc_compress_block_params_init(bc7enc_compress_block_params* p)
+	{
+		bc7enc_compress_block_params_init_perceptual_weights(p);
+
+		p->m_uber_level = 0;
+		p->m_least_squares_passes = 1;
+		p->m_max_partitions_mode1 = BC7ENC_MAX_PARTITIONS1;
+	}
+
+	// bc7enc_compress_block_init() MUST be called before calling bc7enc_compress_block() (or you'll get artifacts).
+	void bc7enc_compress_block_init();
+				
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_comp.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_comp.cpp
--- a/engine/thirdparty/basis_universal/encoder/basisu_comp.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_comp.h
@ -0,0 +1,663 @@
+// basisu_comp.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_frontend.h"
+#include "basisu_backend.h"
+#include "basisu_basis_file.h"
+#include "../transcoder/basisu_transcoder.h"
+#include "basisu_uastc_enc.h"
+
+#define BASISU_LIB_VERSION 116
+#define BASISU_LIB_VERSION_STRING "1.16"
+
+#ifndef BASISD_SUPPORT_KTX2
+	#error BASISD_SUPPORT_KTX2 is undefined
+#endif
+#ifndef BASISD_SUPPORT_KTX2_ZSTD
+	#error BASISD_SUPPORT_KTX2_ZSTD is undefined
+#endif
+
+#if !BASISD_SUPPORT_KTX2
+	#error BASISD_SUPPORT_KTX2 must be enabled when building the encoder. To reduce code size if KTX2 support is not needed, set BASISD_SUPPORT_KTX2_ZSTD to 0
+#endif
+
+namespace basisu
+{
+	struct opencl_context; // forward declaration only; the definition is not in this header
+	typedef opencl_context* opencl_context_ptr;
+
+	const uint32_t BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION = 16384;
+
+	// Allow block's color distance to increase by 1.5 while searching for an alternative nearby endpoint.
+	const float BASISU_DEFAULT_ENDPOINT_RDO_THRESH = 1.5f; 
+	
+	// Allow block's color distance to increase by 1.25 while searching the selector history buffer for a close enough match.
+	const float BASISU_DEFAULT_SELECTOR_RDO_THRESH = 1.25f; 
+
+	const int BASISU_DEFAULT_QUALITY = 128;
+	const float BASISU_DEFAULT_HYBRID_SEL_CB_QUALITY_THRESH = 2.0f;
+
+	const uint32_t BASISU_MAX_IMAGE_DIMENSION = 16384;
+	const uint32_t BASISU_QUALITY_MIN = 1; // lowest valid m_quality_level
+	const uint32_t BASISU_QUALITY_MAX = 255; // highest valid m_quality_level
+	const uint32_t BASISU_MAX_ENDPOINT_CLUSTERS = basisu_frontend::cMaxEndpointClusters; // mirrors the frontend's limit
+	const uint32_t BASISU_MAX_SELECTOR_CLUSTERS = basisu_frontend::cMaxSelectorClusters; // mirrors the frontend's limit
+
+	const uint32_t BASISU_MAX_SLICES = 0xFFFFFF;
+	// Default/min/max used to construct basis_compressor_params::m_rdo_uastc_dict_size.
+	const int BASISU_RDO_UASTC_DICT_SIZE_DEFAULT = 4096; // 32768;
+	const int BASISU_RDO_UASTC_DICT_SIZE_MIN = 64;
+	const int BASISU_RDO_UASTC_DICT_SIZE_MAX = 65536;
+
+	// Per-image record of the source dimensions and the quality metrics (PSNR/SSIM)
+	// for the .basis output, the BC7 output and the best achievable ETC1S encoding.
+	struct image_stats
+	{
+		image_stats()
+		{
+			clear();
+		}
+
+		// Resets every field: empty filename, zero dimensions, all metrics 0, OpenCL failure flag false.
+		void clear()
+		{
+			m_opencl_failed = false;
+
+			m_filename.clear();
+			m_width = m_height = 0;
+
+			// .basis metrics
+			m_basis_rgb_avg_psnr = m_basis_rgba_avg_psnr = m_basis_a_avg_psnr = 0.0f;
+			m_basis_luma_709_psnr = m_basis_luma_601_psnr = 0.0f;
+			m_basis_luma_709_ssim = 0.0f;
+
+			// BC7 metrics
+			m_bc7_rgb_avg_psnr = m_bc7_rgba_avg_psnr = m_bc7_a_avg_psnr = 0.0f;
+			m_bc7_luma_709_psnr = m_bc7_luma_601_psnr = 0.0f;
+			m_bc7_luma_709_ssim = 0.0f;
+
+			// Best ETC1S metrics
+			m_best_etc1s_rgb_avg_psnr = 0.0f;
+			m_best_etc1s_luma_709_psnr = m_best_etc1s_luma_601_psnr = 0.0f;
+			m_best_etc1s_luma_709_ssim = 0.0f;
+		}
+
+		std::string m_filename;
+		uint32_t m_width;
+		uint32_t m_height;
+
+		// .basis compressed (ETC1S or UASTC statistics)
+		float m_basis_rgb_avg_psnr;
+		float m_basis_rgba_avg_psnr;
+		float m_basis_a_avg_psnr;
+		float m_basis_luma_709_psnr;
+		float m_basis_luma_601_psnr;
+		float m_basis_luma_709_ssim;
+
+		// BC7 statistics
+		float m_bc7_rgb_avg_psnr;
+		float m_bc7_rgba_avg_psnr;
+		float m_bc7_a_avg_psnr;
+		float m_bc7_luma_709_psnr;
+		float m_bc7_luma_601_psnr;
+		float m_bc7_luma_709_ssim;
+
+		// Highest achievable quality ETC1S statistics
+		float m_best_etc1s_rgb_avg_psnr;
+		float m_best_etc1s_luma_709_psnr;
+		float m_best_etc1s_luma_601_psnr;
+		float m_best_etc1s_luma_709_ssim;
+
+		// Set to false by clear(); nothing in this struct sets it to true.
+		bool m_opencl_failed;
+	};
+
+	// Boolean setting whose default is the template argument `def`. It remembers whether
+	// a value has been assigned to it since construction or the last clear().
+	template<bool def>
+	struct bool_param
+	{
+		bool_param()
+		{
+			clear();
+		}
+
+		// Restores the default value and forgets any earlier assignment.
+		void clear()
+		{
+			m_changed = false;
+			m_value = def;
+		}
+
+		operator bool() const
+		{
+			return m_value;
+		}
+
+		// Stores v and flags the parameter as changed (even if v equals the current value).
+		bool operator= (bool v)
+		{
+			m_changed = true;
+			return (m_value = v);
+		}
+
+		bool was_changed() const { return m_changed; }
+		void set_changed(bool flag) { m_changed = flag; }
+
+		bool m_value;
+		bool m_changed;
+	};
+
+	// A setting of type T with a default value and an allowed [m_min, m_max] range. It also
+	// records whether the value was modified since construction or the last clear().
+	template<typename T>
+	struct param
+	{
+		// def is stored as-is (it is NOT clamped), so a default outside [min_v, max_v] is preserved
+		// until the first assignment.
+		param(T def, T min_v, T max_v) :
+			m_value(def),
+			m_def(def),
+			m_min(min_v),
+			m_max(max_v),
+			m_changed(false)
+		{
+		}
+
+		// Restores the (unclamped) default and resets the changed flag.
+		void clear()
+		{
+			m_value = m_def;
+			m_changed = false;
+		}
+
+		operator T() const
+		{
+			return m_value;
+		}
+
+		// Stores v clamped to [m_min, m_max] and marks the parameter as changed.
+		T operator= (T v)
+		{
+			m_value = clamp<T>(v, m_min, m_max);
+			m_changed = true;
+			return m_value;
+		}
+
+		// Scales the current value by v and marks the parameter as changed.
+		// The product is clamped to [m_min, m_max] just like operator=, so a multiplication
+		// cannot push the value outside the declared range.
+		T operator *= (T v)
+		{
+			m_value = clamp<T>(m_value * v, m_min, m_max);
+			m_changed = true;
+			return m_value;
+		}
+
+		bool was_changed() const { return m_changed; }
+		void set_changed(bool flag) { m_changed = flag; }
+
+		T m_value;
+		T m_def;
+		T m_min;
+		T m_max;
+		bool m_changed;
+	};
+
+	// All user-configurable inputs of basis_compressor. Most fields are bool_param/param
+	// wrappers so the compressor can tell whether the user explicitly changed them.
+	struct basis_compressor_params
+	{
+		// Supplies the default/min/max of every param<> member, then calls clear() to put
+		// the remaining fields into their default state.
+		basis_compressor_params() :
+			m_compression_level((int)BASISU_DEFAULT_COMPRESSION_LEVEL, 0, (int)BASISU_MAX_COMPRESSION_LEVEL),
+			m_selector_rdo_thresh(BASISU_DEFAULT_SELECTOR_RDO_THRESH, 0.0f, 1e+10f),
+			m_endpoint_rdo_thresh(BASISU_DEFAULT_ENDPOINT_RDO_THRESH, 0.0f, 1e+10f),
+			m_mip_scale(1.0f, .000125f, 4.0f),
+			m_mip_smallest_dimension(1, 1, 16384),
+			m_max_endpoint_clusters(512),
+			m_max_selector_clusters(512),
+			m_quality_level(-1),
+			m_pack_uastc_flags(cPackUASTCLevelDefault),
+			m_rdo_uastc_quality_scalar(1.0f, 0.001f, 50.0f),
+			m_rdo_uastc_dict_size(BASISU_RDO_UASTC_DICT_SIZE_DEFAULT, BASISU_RDO_UASTC_DICT_SIZE_MIN, BASISU_RDO_UASTC_DICT_SIZE_MAX),
+			m_rdo_uastc_max_smooth_block_error_scale(UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE, 1.0f, 300.0f),
+			m_rdo_uastc_smooth_block_max_std_dev(UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV, .01f, 65536.0f),
+			m_rdo_uastc_max_allowed_rms_increase_ratio(UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO, .01f, 100.0f),
+			m_rdo_uastc_skip_block_rms_thresh(UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH, .01f, 100.0f),
+			m_resample_width(0, 1, 16384),
+			m_resample_height(0, 1, 16384),
+			m_resample_factor(0.0f, .00125f, 100.0f),
+			m_ktx2_uastc_supercompression(basist::KTX2_SS_NONE),
+			m_ktx2_zstd_supercompression_level(6, INT_MIN, INT_MAX),
+			m_pJob_pool(nullptr)
+		{
+			clear();
+		}
+
+		// Resets every field to its default and clears every "was changed" flag.
+		// Note m_max_endpoint_clusters/m_max_selector_clusters are reset to 0 here (not the 512 used in
+		// the initializer list), so 0 is the value a freshly constructed object actually holds.
+		void clear()
+		{
+			m_uastc.clear();
+			m_use_opencl.clear();
+			m_status_output.clear();
+
+			m_source_filenames.clear();
+			m_source_alpha_filenames.clear();
+
+			m_source_images.clear();
+			m_source_mipmap_images.clear();
+
+			m_out_filename.clear();
+
+			m_y_flip.clear();
+			m_debug.clear();
+			m_validate_etc1s.clear();
+			m_debug_images.clear();
+			m_perceptual.clear();
+			m_no_selector_rdo.clear();
+			m_selector_rdo_thresh.clear();
+			m_read_source_images.clear();
+			m_write_output_basis_files.clear();
+			m_compression_level.clear();
+			m_compute_stats.clear();
+			m_print_stats.clear();
+			m_check_for_alpha.clear();
+			m_force_alpha.clear();
+			m_multithreading.clear();
+			// Identity swizzle.
+			m_swizzle[0] = 0;
+			m_swizzle[1] = 1;
+			m_swizzle[2] = 2;
+			m_swizzle[3] = 3;
+			m_renormalize.clear();
+			m_disable_hierarchical_endpoint_codebooks.clear();
+
+			m_no_endpoint_rdo.clear();
+			m_endpoint_rdo_thresh.clear();
+
+			m_mip_gen.clear();
+			// clear() already restores the 1.0f default. Do not assign 1.0f afterwards: param::operator=
+			// would set the changed flag and make was_changed() report true on a pristine object.
+			m_mip_scale.clear();
+			m_mip_filter = "kaiser";
+			m_mip_srgb.clear();
+			m_mip_premultiplied.clear();
+			m_mip_renormalize.clear();
+			m_mip_wrapping.clear();
+			m_mip_fast.clear();
+			m_mip_smallest_dimension.clear();
+
+			m_max_endpoint_clusters = 0;
+			m_max_selector_clusters = 0;
+			m_quality_level = -1;
+
+			m_tex_type = basist::cBASISTexType2D;
+			m_userdata0 = 0;
+			m_userdata1 = 0;
+			m_us_per_frame = 0;
+
+			m_pack_uastc_flags = cPackUASTCLevelDefault;
+			m_rdo_uastc.clear();
+			m_rdo_uastc_quality_scalar.clear();
+			// Previously omitted: without this a modified dictionary size survived clear().
+			m_rdo_uastc_dict_size.clear();
+			m_rdo_uastc_max_smooth_block_error_scale.clear();
+			m_rdo_uastc_smooth_block_max_std_dev.clear();
+			m_rdo_uastc_max_allowed_rms_increase_ratio.clear();
+			m_rdo_uastc_skip_block_rms_thresh.clear();
+			m_rdo_uastc_favor_simpler_modes_in_rdo_mode.clear();
+			m_rdo_uastc_multithreading.clear();
+
+			m_resample_width.clear();
+			m_resample_height.clear();
+			m_resample_factor.clear();
+
+			m_pGlobal_codebooks = nullptr;
+
+			m_create_ktx2_file.clear();
+			m_ktx2_uastc_supercompression = basist::KTX2_SS_NONE;
+			m_ktx2_key_values.clear();
+			m_ktx2_zstd_supercompression_level.clear();
+			m_ktx2_srgb_transfer_func.clear();
+
+			m_validate_output_data.clear();
+
+			m_pJob_pool = nullptr;
+		}
+
+		// True to generate UASTC .basis file data, otherwise ETC1S.
+		bool_param<false> m_uastc;
+
+		bool_param<false> m_use_opencl;
+
+		// If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG images to read.
+		// Otherwise, the compressor processes the images in m_source_images.
+		basisu::vector<std::string> m_source_filenames;
+		basisu::vector<std::string> m_source_alpha_filenames;
+
+		basisu::vector<image> m_source_images;
+
+		// Stores mipmaps starting from level 1. Level 0 is still stored in m_source_images, as usual.
+		// If m_source_mipmap_images isn't empty, automatic mipmap generation isn't done. m_source_mipmap_images.size() MUST equal m_source_images.size() or the compressor returns an error.
+		// The compressor applies the user-provided swizzling (in m_swizzle) to these images.
+		basisu::vector< basisu::vector<image> > m_source_mipmap_images;
+
+		// Filename of the output basis file
+		std::string m_out_filename;
+
+		// The params are done this way so we can detect when the user has explicitly changed them.
+
+		// Flip images across Y axis
+		bool_param<false> m_y_flip;
+
+		// If true, the compressor will print basis status to stdout during compression.
+		bool_param<true> m_status_output;
+
+		// Output debug information during compression
+		bool_param<false> m_debug;
+		bool_param<false> m_validate_etc1s;
+
+		// m_debug_images is pretty slow
+		bool_param<false> m_debug_images;
+
+		// ETC1S compression level, from 0 to BASISU_MAX_COMPRESSION_LEVEL (higher is slower).
+		// This parameter controls numerous internal encoding speed vs. compression efficiency/performance tradeoffs.
+		// Note this is NOT the same as the ETC1S quality level, and most users shouldn't change this.
+		param<int> m_compression_level;
+
+		// Use perceptual sRGB colorspace metrics instead of linear
+		bool_param<true> m_perceptual;
+
+		// Disable selector RDO, for faster compression but larger files
+		bool_param<false> m_no_selector_rdo;
+		param<float> m_selector_rdo_thresh;
+
+		bool_param<false> m_no_endpoint_rdo;
+		param<float> m_endpoint_rdo_thresh;
+
+		// Read source images from m_source_filenames/m_source_alpha_filenames
+		bool_param<false> m_read_source_images;
+
+		// Write the output basis file to disk using m_out_filename
+		bool_param<false> m_write_output_basis_files;
+
+		// Compute and display image metrics
+		bool_param<false> m_compute_stats;
+
+		// Print stats to stdout, if m_compute_stats is true.
+		bool_param<true> m_print_stats;
+
+		// Check to see if any input image has an alpha channel, if so then the output basis file will have alpha channels
+		bool_param<true> m_check_for_alpha;
+
+		// Always put alpha slices in the output basis file, even when the input doesn't have alpha
+		bool_param<false> m_force_alpha;
+		bool_param<true> m_multithreading;
+
+		// Channel swizzle the compressor applies to the source images. clear() sets the identity mapping (0, 1, 2, 3).
+		char m_swizzle[4];
+
+		bool_param<false> m_renormalize;
+
+		// If true the front end will not use 2 level endpoint codebook searching, for slightly higher quality but much slower execution.
+		// Note some m_compression_level's disable this automatically.
+		bool_param<false> m_disable_hierarchical_endpoint_codebooks;
+
+		// mipmap generation parameters
+		bool_param<false> m_mip_gen;
+		param<float> m_mip_scale;
+		std::string m_mip_filter;
+		bool_param<false> m_mip_srgb;
+		bool_param<true> m_mip_premultiplied; // not currently supported
+		bool_param<false> m_mip_renormalize;
+		bool_param<true> m_mip_wrapping;
+		bool_param<true> m_mip_fast;
+		param<int> m_mip_smallest_dimension;
+
+		// Codebook size (quality) control.
+		// If m_quality_level != -1, it controls the quality level. It ranges from [1,255] or [BASISU_QUALITY_MIN, BASISU_QUALITY_MAX].
+		// Otherwise m_max_endpoint_clusters/m_max_selector_clusters controls the codebook sizes directly.
+		uint32_t m_max_endpoint_clusters;
+		uint32_t m_max_selector_clusters;
+		int m_quality_level;
+
+		// m_tex_type, m_userdata0, m_userdata1, m_us_per_frame - These fields go directly into the Basis file header.
+		basist::basis_texture_type m_tex_type;
+		uint32_t m_userdata0;
+		uint32_t m_userdata1;
+		uint32_t m_us_per_frame;
+
+		// cPackUASTCLevelDefault, etc.
+		uint32_t m_pack_uastc_flags;
+		bool_param<false> m_rdo_uastc;
+		param<float> m_rdo_uastc_quality_scalar;
+		param<int> m_rdo_uastc_dict_size;
+		param<float> m_rdo_uastc_max_smooth_block_error_scale;
+		param<float> m_rdo_uastc_smooth_block_max_std_dev;
+		param<float> m_rdo_uastc_max_allowed_rms_increase_ratio;
+		param<float> m_rdo_uastc_skip_block_rms_thresh;
+		bool_param<true> m_rdo_uastc_favor_simpler_modes_in_rdo_mode;
+		bool_param<true> m_rdo_uastc_multithreading;
+
+		// Note the defaults (0 / 0.0f) lie below the params' minimum values; they are only
+		// clamped into range once a value is assigned.
+		param<int> m_resample_width;
+		param<int> m_resample_height;
+		param<float> m_resample_factor;
+
+		const basist::basisu_lowlevel_etc1s_transcoder *m_pGlobal_codebooks;
+
+		// KTX2 specific parameters.
+		// Internally, the compressor always creates a .basis file then it converts that losslessly to KTX2.
+		bool_param<false> m_create_ktx2_file;
+		basist::ktx2_supercompression m_ktx2_uastc_supercompression;
+		basist::ktx2_transcoder::key_value_vec m_ktx2_key_values;
+		param<int> m_ktx2_zstd_supercompression_level;
+		bool_param<false> m_ktx2_srgb_transfer_func;
+
+		bool_param<false> m_validate_output_data;
+
+		job_pool *m_pJob_pool;
+	};
+
+	// Important: basisu_encoder_init() MUST be called first before using this class.
+	class basis_compressor
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(basis_compressor);
+
+	public:
+		basis_compressor();
+		~basis_compressor();
+
+		// Note it *should* be possible to call init() multiple times with different inputs, but this scenario isn't well tested. Ideally, create 1 object, compress, then delete it.
+		bool init(const basis_compressor_params &params);
+		// Status codes returned by process(). cECSuccess is 0; the remaining enumerators are failure codes.
+		enum error_code
+		{
+			cECSuccess = 0,
+			cECFailedInitializing,
+			cECFailedReadingSourceImages,
+			cECFailedValidating,
+			cECFailedEncodeUASTC,
+			cECFailedFrontEnd,
+			cECFailedFontendExtract,
+			cECFailedBackend,
+			cECFailedCreateBasisFile,
+			cECFailedWritingOutput,
+			cECFailedUASTCRDOPostProcess,
+			cECFailedCreateKTX2File
+		};
+		// Returns cECSuccess or one of the failure codes above.
+		error_code process();
+
+		// The output .basis file will always be valid if process() succeeded.
+		const uint8_vec &get_output_basis_file() const { return m_output_basis_file; }
+		
+		// The output .ktx2 file will only be valid if m_create_ktx2_file was true and process() succeeded.
+		const uint8_vec& get_output_ktx2_file() const { return m_output_ktx2_file; }
+
+		const basisu::vector<image_stats> &get_stats() const { return m_stats; }
+
+		uint32_t get_basis_file_size() const { return m_basis_file_size; }
+		double get_basis_bits_per_texel() const { return m_basis_bits_per_texel; }
+		
+		bool get_any_source_image_has_alpha() const { return m_any_source_image_has_alpha; }
+
+		bool get_opencl_failed() const { return m_opencl_failed; }
+								
+	private:
+		basis_compressor_params m_params;
+
+		opencl_context_ptr m_pOpenCL_context;
+		
+		basisu::vector<image> m_slice_images;
+
+		basisu::vector<image_stats> m_stats; // exposed through get_stats()
+
+		uint32_t m_basis_file_size;
+		double m_basis_bits_per_texel;
+						
+		basisu_backend_slice_desc_vec m_slice_descs;
+
+		uint32_t m_total_blocks;
+		
+		basisu_frontend m_frontend;
+		pixel_block_vec m_source_blocks;
+
+		basisu::vector<gpu_image> m_frontend_output_textures;
+
+		basisu::vector<gpu_image> m_best_etc1s_images;
+		basisu::vector<image> m_best_etc1s_images_unpacked;
+
+		basisu_backend m_backend;
+
+		basisu_file m_basis_file;
+
+		basisu::vector<gpu_image> m_decoded_output_textures;
+		basisu::vector<image> m_decoded_output_textures_unpacked;
+		basisu::vector<gpu_image> m_decoded_output_textures_bc7;
+		basisu::vector<image> m_decoded_output_textures_unpacked_bc7;
+
+		uint8_vec m_output_basis_file; // exposed through get_output_basis_file()
+		uint8_vec m_output_ktx2_file; // exposed through get_output_ktx2_file()
+		
+		basisu::vector<gpu_image> m_uastc_slice_textures;
+		basisu_backend_output m_uastc_backend_output;
+
+		bool m_any_source_image_has_alpha;
+
+		bool m_opencl_failed;
+		// Internal helpers; their definitions are not part of this header.
+		bool read_source_images();
+		bool extract_source_blocks();
+		bool process_frontend();
+		bool extract_frontend_texture_data();
+		bool process_backend();
+		bool create_basis_file_and_transcode();
+		bool write_output_files_and_compute_stats();
+		error_code encode_slices_to_uastc();
+		bool generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha);
+		bool validate_texture_type_constraints();
+		bool validate_ktx2_constraints();
+		void get_dfd(uint8_vec& dfd, const basist::ktx2_header& hdr);
+		bool create_ktx2_file();
+	};
+				
+	// Alternative simple C-style wrapper API around the basis_compressor class. 
+	// This doesn't expose every encoder feature, but it's enough to get going.
+	// Important: basisu_encoder_init() MUST be called first before calling these functions.
+	//
+	// Input parameters:
+	//   source_images: Array of "image" objects, one per mipmap level, largest mipmap level first.
+	// OR
+	//   pImageRGBA: pointer to a 32-bpp RGBx or RGBA raster image, R first in memory, A last. Top scanline first in memory.
+	//   width/height/pitch_in_pixels: dimensions of pImageRGBA
+	//   
+	// flags_and_quality: Combination of the flags declared below (cFlagUseOpenCL, etc.) logically OR'd with the ETC1S or UASTC level, i.e. "cFlagSRGB | cFlagGenMipsClamp | cFlagThreaded | 128" or "cFlagSRGB | cFlagGenMipsClamp | cFlagUASTC | cFlagThreaded | cPackUASTCLevelDefault".
+	//	  In ETC1S mode, the lower 8-bits are the ETC1S quality level which ranges from [1,255] (higher=better quality/larger files)
+	//	  In UASTC mode, the lower 8-bits are the UASTC pack level (see cPackUASTCLevelFastest, etc.). Fastest/lowest quality is 0, so be sure to set it correctly. 
+	// 
+	// uastc_rdo_quality: Float UASTC RDO quality level (0=no change, higher values lower quality but increase compressibility, initially try .5-1.5)
+	// 
+	// pSize: Returns the output data's compressed size in bytes
+	// 
+	// Return value is the compressed .basis or .ktx2 file data, or nullptr on failure. Must call basis_free_data() to free it.
+	// Flag bits for the flags_and_quality argument of basis_compress(). The flags start at bit 8;
+	// the low 8 bits carry the ETC1S quality level or the UASTC pack level.
+	enum
+	{
+		// use OpenCL if available
+		cFlagUseOpenCL = 1 << 8,
+		// use multiple threads for compression
+		cFlagThreaded = 1 << 9,
+		// enable debug output
+		cFlagDebug = 1 << 10,
+
+		// generate a KTX2 file
+		cFlagKTX2 = 1 << 11,
+		// use KTX2 Zstd supercompression on UASTC files
+		cFlagKTX2UASTCSuperCompression = 1 << 12,
+
+		// input texture is sRGB, use perceptual colorspace metrics, also use sRGB filtering during mipmap gen, and also sets KTX2 output transfer func to sRGB
+		cFlagSRGB = 1 << 13,
+		// generate mipmaps with clamp addressing
+		cFlagGenMipsClamp = 1 << 14,
+		// generate mipmaps with wrap addressing
+		cFlagGenMipsWrap = 1 << 15,
+
+		// flip source image on Y axis before compression
+		cFlagYFlip = 1 << 16,
+
+		// use UASTC compression vs. ETC1S
+		cFlagUASTC = 1 << 17,
+		// use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar)
+		cFlagUASTCRDO = 1 << 18,
+
+		// print image stats to stdout
+		cFlagPrintStats = 1 << 19,
+		// print status to stdout
+		cFlagPrintStatus = 1 << 20
+	};
+
+	// This function accepts an array of source images. 
+	// If more than one image is provided, it's assumed the images form a mipmap pyramid and automatic mipmap generation is disabled.
+	// Returns a pointer to the compressed .basis or .ktx2 file data. *pSize is the size of the compressed data. The returned block must be freed using basis_free_data().
+	// basisu_encoder_init() MUST be called first!
+	void* basis_compress(
+		const basisu::vector<image> &source_images,
+		uint32_t flags_and_quality, float uastc_rdo_quality,
+		size_t* pSize,
+		image_stats* pStats = nullptr);
+
+	// This function only accepts a single source image.
+	void* basis_compress(
+		const uint8_t* pImageRGBA, uint32_t width, uint32_t height, uint32_t pitch_in_pixels,
+		uint32_t flags_and_quality, float uastc_rdo_quality,
+		size_t* pSize,
+		image_stats* pStats = nullptr);
+
+	// Frees the dynamically allocated file data returned by basis_compress().
+	void basis_free_data(void* p);
+
+	// Runs a short benchmark using synthetic image data to time OpenCL encoding vs. CPU encoding, with multithreading enabled.
+	// Returns true if opencl is worth using on this system, otherwise false.
+	// If pOpenCL_failed is not null, it will be set to true if OpenCL encoding failed *on this particular machine/driver/BasisU version* and the encoder fell back to CPU encoding.
+	// basisu_encoder_init() MUST be called first. If OpenCL support wasn't enabled this always returns false.
+	bool basis_benchmark_etc1s_opencl(bool *pOpenCL_failed = nullptr);
+
+	// Parallel compression API
+	// Per-texture output slot filled by basis_parallel_compress().
+	struct parallel_results
+	{
+		double m_total_time;
+		basis_compressor::error_code m_error_code;
+		uint8_vec m_basis_file;
+		uint8_vec m_ktx2_file;
+		basisu::vector<image_stats> m_stats;
+		double m_basis_bits_per_texel;
+		bool m_any_source_image_has_alpha;
+
+		parallel_results()
+		{
+			clear();
+		}
+
+		// Empties the output buffers and stats, zeroes the numeric fields, and sets the error
+		// code to cECFailedInitializing.
+		void clear()
+		{
+			m_error_code = basis_compressor::cECFailedInitializing;
+			m_any_source_image_has_alpha = false;
+
+			m_total_time = 0.0;
+			m_basis_bits_per_texel = 0.0;
+
+			m_basis_file.clear();
+			m_ktx2_file.clear();
+			m_stats.clear();
+		}
+	};
+		
+	// Compresses an array of input textures across total_threads threads using the basis_compressor class.
+	// Compressing multiple textures at a time is substantially more efficient than just compressing one at a time.
+	// total_threads must be >= 1.
+	bool basis_parallel_compress(
+		uint32_t total_threads,
+		const basisu::vector<basis_compressor_params> &params_vec,
+		basisu::vector< parallel_results > &results_vec);
+		
+} // namespace basisu
+
--- a/engine/thirdparty/basis_universal/encoder/basisu_enc.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_enc.cpp
--- a/engine/thirdparty/basis_universal/encoder/basisu_enc.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_enc.h
--- a/engine/thirdparty/basis_universal/encoder/basisu_etc.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_etc.cpp
--- a/engine/thirdparty/basis_universal/encoder/basisu_etc.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_etc.h
--- a/engine/thirdparty/basis_universal/encoder/basisu_frontend.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_frontend.cpp
--- a/engine/thirdparty/basis_universal/encoder/basisu_frontend.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_frontend.h
@ -0,0 +1,353 @@
+// basisu_frontend.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_enc.h"
+#include "basisu_etc.h"
+#include "basisu_gpu_texture.h"
+#include "../transcoder/basisu_file_headers.h"
+#include "../transcoder/basisu_transcoder.h"
+
+namespace basisu
+{
+	struct opencl_context;
+	typedef opencl_context* opencl_context_ptr;
+
+	// Pair of uint32_t components with assert-checked indexing.
+	struct vec2U
+	{
+		uint32_t m_comps[2];
+
+		// Leaves both components uninitialized.
+		vec2U() { }
+
+		vec2U(uint32_t a, uint32_t b)
+		{
+			m_comps[0] = a;
+			m_comps[1] = b;
+		}
+
+		void set(uint32_t a, uint32_t b)
+		{
+			m_comps[0] = a;
+			m_comps[1] = b;
+		}
+
+		// i must be 0 or 1 (asserted).
+		uint32_t operator[] (uint32_t i) const
+		{
+			assert(i < 2);
+			return m_comps[i];
+		}
+
+		uint32_t &operator[] (uint32_t i)
+		{
+			assert(i < 2);
+			return m_comps[i];
+		}
+	};
+
+	const uint32_t BASISU_DEFAULT_COMPRESSION_LEVEL = 2; // default of basisu_frontend::params::m_compression_level
+	const uint32_t BASISU_MAX_COMPRESSION_LEVEL = 6; // highest compression level
+
+	// ETC1S frontend: holds the source blocks, the endpoint/selector cluster data and the
+	// encoded ETC1S blocks, and exposes read-only accessors over them.
+	class basisu_frontend
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(basisu_frontend);
+
+	public:
+
+		basisu_frontend() :
+			m_total_blocks(0),
+			m_total_pixels(0),
+			m_endpoint_refinement(false),
+			m_use_hierarchical_endpoint_codebooks(false),
+			m_use_hierarchical_selector_codebooks(false),
+			m_num_endpoint_codebook_iterations(0),
+			m_num_selector_codebook_iterations(0),
+			m_opencl_failed(false)
+		{
+		}
+
+		enum
+		{
+			cMaxEndpointClusters = 16128,
+
+			cMaxSelectorClusters = 16128,
+		};
+
+		// Inputs for init().
+		struct params
+		{
+			// Every member is given a defined value here. m_pGlobal_codebooks used to be missing
+			// from this list, which left it as an indeterminate pointer unless the caller set it.
+			params() :
+				m_num_source_blocks(0),
+				m_pSource_blocks(nullptr),
+				m_max_endpoint_clusters(256),
+				m_max_selector_clusters(256),
+				m_compression_level(BASISU_DEFAULT_COMPRESSION_LEVEL),
+				m_perceptual(true),
+				m_debug_stats(false),
+				m_debug_images(false),
+				m_dump_endpoint_clusterization(true),
+				m_validate(false),
+				m_multithreaded(false),
+				m_disable_hierarchical_endpoint_codebooks(false),
+				m_tex_type(basist::cBASISTexType2D),
+				m_pGlobal_codebooks(nullptr),
+				m_pOpenCL_context(nullptr),
+				m_pJob_pool(nullptr)
+			{
+			}
+
+			uint32_t m_num_source_blocks;
+			pixel_block *m_pSource_blocks;
+
+			uint32_t m_max_endpoint_clusters;
+			uint32_t m_max_selector_clusters;
+
+			uint32_t m_compression_level;
+
+			bool m_perceptual;
+			bool m_debug_stats;
+			bool m_debug_images;
+			bool m_dump_endpoint_clusterization;
+			bool m_validate;
+			bool m_multithreaded;
+			bool m_disable_hierarchical_endpoint_codebooks;
+
+			basist::basis_texture_type m_tex_type;
+			const basist::basisu_lowlevel_etc1s_transcoder *m_pGlobal_codebooks;
+
+			opencl_context_ptr m_pOpenCL_context;
+
+			job_pool *m_pJob_pool;
+		};
+
+		bool init(const params &p);
+
+		bool compress();
+
+		const params &get_params() const { return m_params; }
+
+		const pixel_block &get_source_pixel_block(uint32_t i) const { return m_source_blocks[i]; }
+
+		// RDO output blocks
+		uint32_t get_total_output_blocks() const { return static_cast<uint32_t>(m_encoded_blocks.size()); }
+
+		const etc_block &get_output_block(uint32_t block_index) const { return m_encoded_blocks[block_index]; }
+		const etc_block_vec &get_output_blocks() const { return m_encoded_blocks; }
+
+		// "Best" ETC1S blocks
+		const etc_block &get_etc1s_block(uint32_t block_index) const { return m_etc1_blocks_etc1s[block_index]; }
+
+		// Per-block flags
+		bool get_diff_flag(uint32_t block_index) const { return m_encoded_blocks[block_index].get_diff_bit(); }
+
+		// Endpoint clusters
+		uint32_t get_total_endpoint_clusters() const { return static_cast<uint32_t>(m_endpoint_clusters.size()); }
+		uint32_t get_subblock_endpoint_cluster_index(uint32_t block_index, uint32_t subblock_index) const { return m_block_endpoint_clusters_indices[block_index][subblock_index]; }
+
+		// individual_mode indexes the two-entry per-cluster arrays (false -> [0], true -> [1]).
+		const color_rgba &get_endpoint_cluster_unscaled_color(uint32_t cluster_index, bool individual_mode) const { return m_endpoint_cluster_etc_params[cluster_index].m_color_unscaled[individual_mode]; }
+		uint32_t get_endpoint_cluster_inten_table(uint32_t cluster_index, bool individual_mode) const { return m_endpoint_cluster_etc_params[cluster_index].m_inten_table[individual_mode]; }
+
+		bool get_endpoint_cluster_color_is_used(uint32_t cluster_index, bool individual_mode) const { return m_endpoint_cluster_etc_params[cluster_index].m_color_used[individual_mode]; }
+
+		// Selector clusters
+		uint32_t get_total_selector_clusters() const { return static_cast<uint32_t>(m_selector_cluster_block_indices.size()); }
+		uint32_t get_block_selector_cluster_index(uint32_t block_index) const { return m_block_selector_cluster_index[block_index]; }
+		const etc_block &get_selector_cluster_selector_bits(uint32_t cluster_index) const { return m_optimized_cluster_selectors[cluster_index]; }
+
+		// Returns block indices using each selector cluster
+		const uint_vec &get_selector_cluster_block_indices(uint32_t selector_cluster_index) const { return m_selector_cluster_block_indices[selector_cluster_index]; }
+
+		void dump_debug_image(const char *pFilename, uint32_t first_block, uint32_t num_blocks_x, uint32_t num_blocks_y, bool output_blocks);
+
+		void reoptimize_remapped_endpoints(const uint_vec &new_block_endpoints, int_vec &old_to_new_endpoint_cluster_indices, bool optimize_final_codebook, uint_vec *pBlock_selector_indices = nullptr);
+
+		bool get_opencl_failed() const { return m_opencl_failed; }
+
+	private:
+		params m_params;
+		uint32_t m_total_blocks;
+		uint32_t m_total_pixels;
+
+		bool m_endpoint_refinement;
+		bool m_use_hierarchical_endpoint_codebooks;
+		bool m_use_hierarchical_selector_codebooks;
+
+		uint32_t m_num_endpoint_codebook_iterations;
+		uint32_t m_num_selector_codebook_iterations;
+
+		// Source pixels for each blocks
+		pixel_block_vec m_source_blocks;
+
+		// The quantized ETC1S texture.
+		etc_block_vec m_encoded_blocks;
+
+		// Quantized blocks after endpoint quant, but before selector quant
+		etc_block_vec m_orig_encoded_blocks;
+
+		// Full quality ETC1S texture
+		etc_block_vec m_etc1_blocks_etc1s;
+
+		typedef vec<6, float> vec6F;
+
+		// Endpoint clusterizer
+		typedef tree_vector_quant<vec6F> vec6F_quantizer;
+		vec6F_quantizer m_endpoint_clusterizer;
+
+		// For each endpoint cluster: An array of which subblock indices (block_index*2+subblock) are located in that cluster.
+		basisu::vector<uint_vec> m_endpoint_clusters;
+
+		// Array of subblock indices for each parent endpoint cluster
+		// Note: Initially, each endpoint cluster will only live in a single parent cluster, in a shallow tree.
+		// As the endpoint clusters are manipulated this constraint gets broken.
+		basisu::vector<uint_vec> m_endpoint_parent_clusters;
+
+		// Each block's parent endpoint cluster index
+		uint8_vec m_block_parent_endpoint_cluster;
+
+		// Array of endpoint cluster indices for each parent endpoint cluster
+		basisu::vector<uint_vec> m_endpoint_clusters_within_each_parent_cluster;
+
+		struct endpoint_cluster_etc_params
+		{
+			endpoint_cluster_etc_params()
+			{
+				clear();
+			}
+
+			void clear()
+			{
+				clear_obj(m_color_unscaled);
+				clear_obj(m_inten_table);
+				clear_obj(m_color_error);
+				m_subblocks.clear();
+
+				clear_obj(m_color_used);
+				m_valid = false;
+			}
+
+			// TODO: basisu doesn't use individual mode.
+			color_rgba m_color_unscaled[2]; // [use_individual_mode]
+			uint32_t m_inten_table[2];
+
+			uint64_t m_color_error[2];
+
+			uint_vec m_subblocks;
+
+			bool m_color_used[2];
+
+			bool m_valid;
+
+			// Equality and ordering only consider m_color_unscaled and m_inten_table; the error,
+			// subblock list and flags do not participate.
+			bool operator== (const endpoint_cluster_etc_params &other) const
+			{
+				for (uint32_t i = 0; i < 2; i++)
+				{
+					if (m_color_unscaled[i] != other.m_color_unscaled[i])
+						return false;
+				}
+
+				if (m_inten_table[0] != other.m_inten_table[0])
+					return false;
+				if (m_inten_table[1] != other.m_inten_table[1])
+					return false;
+
+				return true;
+			}
+
+			// Lexicographic order over (m_color_unscaled[0], m_color_unscaled[1], m_inten_table[0], m_inten_table[1]).
+			bool operator< (const endpoint_cluster_etc_params &other) const
+			{
+				for (uint32_t i = 0; i < 2; i++)
+				{
+					if (m_color_unscaled[i] < other.m_color_unscaled[i])
+						return true;
+					else if (m_color_unscaled[i] != other.m_color_unscaled[i])
+						return false;
+				}
+
+				if (m_inten_table[0] < other.m_inten_table[0])
+					return true;
+				else if (m_inten_table[0] == other.m_inten_table[0])
+				{
+					if (m_inten_table[1] < other.m_inten_table[1])
+						return true;
+				}
+
+				return false;
+			}
+		};
+
+		typedef basisu::vector<endpoint_cluster_etc_params> cluster_subblock_etc_params_vec;
+
+		// Each endpoint cluster's ETC1S parameters
+		cluster_subblock_etc_params_vec m_endpoint_cluster_etc_params;
+
+		// The endpoint cluster index used by each ETC1 subblock.
+		basisu::vector<vec2U> m_block_endpoint_clusters_indices;
+
+		// The block(s) within each selector cluster
+		// Note: If you add anything here that uses selector cluster indices, be sure to update optimize_selector_codebook()!
+		basisu::vector<uint_vec> m_selector_cluster_block_indices;
+
+		// The selector bits for each selector cluster.
+		basisu::vector<etc_block> m_optimized_cluster_selectors;
+
+		// The block(s) within each parent selector cluster.
+		basisu::vector<uint_vec> m_selector_parent_cluster_block_indices;
+
+		// Each block's parent selector cluster
+		uint8_vec m_block_parent_selector_cluster;
+
+		// Array of selector cluster indices for each parent selector cluster
+		basisu::vector<uint_vec> m_selector_clusters_within_each_parent_cluster;
+
+		// Each block's selector cluster index
+		basisu::vector<uint32_t> m_block_selector_cluster_index;
+
+		struct subblock_endpoint_quant_err
+		{
+			uint64_t m_total_err;
+			uint32_t m_cluster_index;
+			uint32_t m_cluster_subblock_index;
+			uint32_t m_block_index;
+			uint32_t m_subblock_index;
+
+			// Orders by total error, then block index, then subblock index. The cluster fields are
+			// not part of the ordering.
+			bool operator< (const subblock_endpoint_quant_err &rhs) const
+			{
+				if (m_total_err < rhs.m_total_err)
+					return true;
+				else if (m_total_err == rhs.m_total_err)
+				{
+					if (m_block_index < rhs.m_block_index)
+						return true;
+					else if (m_block_index == rhs.m_block_index)
+						return m_subblock_index < rhs.m_subblock_index;
+				}
+				return false;
+			}
+		};
+
+		// The sorted subblock endpoint quant error for each endpoint cluster
+		basisu::vector<subblock_endpoint_quant_err> m_subblock_endpoint_quant_err_vec;
+
+		std::mutex m_lock;
+
+		bool m_opencl_failed;
+
+		//-----------------------------------------------------------------------------
+
+		void init_etc1_images();
+		bool init_global_codebooks();
+		void init_endpoint_training_vectors();
+		void dump_endpoint_clusterization_visualization(const char *pFilename, bool vis_endpoint_colors);
+		void generate_endpoint_clusters();
+		void compute_endpoint_subblock_error_vec();
+		void introduce_new_endpoint_clusters();
+		void generate_endpoint_codebook(uint32_t step);
+		uint32_t refine_endpoint_clusterization();
+		void eliminate_redundant_or_empty_endpoint_clusters();
+		void generate_block_endpoint_clusters();
+		void compute_endpoint_clusters_within_each_parent_cluster();
+		void compute_selector_clusters_within_each_parent_cluster();
+		void create_initial_packed_texture();
+		void generate_selector_clusters();
+		void create_optimized_selector_codebook(uint32_t iter);
+		void find_optimal_selector_clusters_for_each_block();
+		uint32_t refine_block_endpoints_given_selectors();
+		void finalize();
+		bool validate_endpoint_cluster_hierarchy(bool ensure_clusters_have_same_parents) const;
+		bool validate_output() const;
+		void introduce_special_selector_clusters();
+		void optimize_selector_codebook();
+		bool check_etc1s_constraints() const;
+	};
+
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
--- a/engine/thirdparty/basis_universal/encoder/basisu_gpu_texture.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_gpu_texture.h
@ -0,0 +1,154 @@
+// basisu_gpu_texture.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "../transcoder/basisu.h"
+#include "basisu_etc.h"
+
+namespace basisu
+{
+	// GPU texture "image"
+	class gpu_image
+	{
+	public:
+		// NOTE(review): presumably the largest supported block edge, in pixels - confirm against the users of this constant.
+		enum { cMaxBlockSize = 12 };
+
+		// Creates an empty image: invalid format, zero dimensions, no block storage.
+		gpu_image()
+		{
+			clear();
+		}
+
+		// Creates an image of the given format and pixel dimensions (see init()).
+		gpu_image(texture_format fmt, uint32_t width, uint32_t height)
+		{
+			init(fmt, width, height);
+		}
+
+		// Resets the image to the empty state: invalid format, all dimensions zero, block array emptied.
+		void clear()
+		{
+			m_fmt = texture_format::cInvalidTextureFormat;
+			m_width = 0;
+			m_height = 0;
+			m_block_width = 0;
+			m_block_height = 0;
+			m_blocks_x = 0;
+			m_blocks_y = 0;
+			m_qwords_per_block = 0;
+			m_blocks.clear();
+		}
+
+		inline texture_format get_format() const { return m_fmt; }
+		
+		// Width/height in pixels
+		inline uint32_t get_pixel_width() const { return m_width; }
+		inline uint32_t get_pixel_height() const { return m_height; }
+		
+		// Width/height in blocks, row pitch is assumed to be m_blocks_x.
+		inline uint32_t get_blocks_x() const { return m_blocks_x; }
+		inline uint32_t get_blocks_y() const { return m_blocks_y; }
+
+		// Size of each block in pixels
+		inline uint32_t get_block_width() const { return m_block_width; }
+		inline uint32_t get_block_height() const { return m_block_height; }
+
+		// Storage size queries. Each block occupies m_qwords_per_block 64-bit words.
+		inline uint32_t get_qwords_per_block() const { return m_qwords_per_block; }
+		inline uint32_t get_total_blocks() const { return m_blocks_x * m_blocks_y; }
+		inline uint32_t get_bytes_per_block() const { return get_qwords_per_block() * sizeof(uint64_t); }
+		inline uint32_t get_row_pitch_in_bytes() const { return get_bytes_per_block() * get_blocks_x(); }
+
+		inline const uint64_vec &get_blocks() const { return m_blocks; }
+		
+		// Pointer to the first qword of block storage.
+		// These index m_blocks[0], so they should only be called on an image that holds at least one block.
+		inline const uint64_t *get_ptr() const { return &m_blocks[0]; }
+		inline uint64_t *get_ptr() { return &m_blocks[0]; }
+
+		inline uint32_t get_size_in_bytes() const { return get_total_blocks() * get_qwords_per_block() * sizeof(uint64_t); }
+
+		// Returns a pointer to qword element_index of the block at (block_x, block_y).
+		// Blocks are stored row-major with a row pitch of m_blocks_x blocks.
+		// The bounds checks (including element_index) match the non-const overload below.
+		inline const void *get_block_ptr(uint32_t block_x, uint32_t block_y, uint32_t element_index = 0) const
+		{
+			assert(block_x < m_blocks_x && block_y < m_blocks_y && element_index < m_qwords_per_block);
+			return &m_blocks[(block_x + block_y * m_blocks_x) * m_qwords_per_block + element_index];
+		}
+
+		// Mutable counterpart of the accessor above.
+		inline void *get_block_ptr(uint32_t block_x, uint32_t block_y, uint32_t element_index = 0)
+		{
+			assert(block_x < m_blocks_x && block_y < m_blocks_y && element_index < m_qwords_per_block);
+			return &m_blocks[(block_x + block_y * m_blocks_x) * m_qwords_per_block + element_index];
+		}
+
+		// (Re)initializes the image for the given format and pixel dimensions.
+		// The block size and qwords-per-block are looked up from the format, and the
+		// block counts are the pixel dimensions rounded up to whole blocks.
+		void init(texture_format fmt, uint32_t width, uint32_t height)
+		{
+			m_fmt = fmt;
+			m_width = width;
+			m_height = height;
+			m_block_width = basisu::get_block_width(m_fmt);
+			m_block_height = basisu::get_block_height(m_fmt);
+			m_blocks_x = (m_width + m_block_width - 1) / m_block_width;
+			m_blocks_y = (m_height + m_block_height - 1) / m_block_height;
+			m_qwords_per_block = basisu::get_qwords_per_block(m_fmt);
+
+			// Shrink to zero before growing; presumably so no block data from a previous
+			// init() survives in the new storage - confirm against basisu::vector::resize().
+			m_blocks.resize(0);
+			m_blocks.resize(m_blocks_x * m_blocks_y * m_qwords_per_block);
+		}
+
+		// Decodes the blocks into img; implemented out of line.
+		bool unpack(image& img) const;
+		
+		// Overwrites the stored pixel dimensions only; block counts and storage are left untouched.
+		void override_dimensions(uint32_t w, uint32_t h)
+		{
+			m_width = w;
+			m_height = h;
+		}
+
+	private:
+		texture_format m_fmt;
+		uint32_t m_width, m_height, m_blocks_x, m_blocks_y, m_block_width, m_block_height, m_qwords_per_block;
+		// Block data, m_qwords_per_block qwords per block, row-major.
+		uint64_vec m_blocks;
+	};
+
+	typedef basisu::vector<gpu_image> gpu_image_vec;
+
+	// KTX file writing
+
+	bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag);
+		
+	bool write_compressed_texture_file(const char *pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag);
+	
+	// Convenience overload: writes a single (non-cubemap) image vector by wrapping it
+	// in a one-element array and forwarding to the array-based overload.
+	inline bool write_compressed_texture_file(const char *pFilename, const gpu_image_vec &g)
+	{
+		const bool cubemap_flag = false;
+
+		basisu::vector<gpu_image_vec> single_entry_array;
+		single_entry_array.push_back(g);
+
+		return write_compressed_texture_file(pFilename, single_entry_array, cubemap_flag);
+	}
+
+	bool write_compressed_texture_file(const char *pFilename, const gpu_image &g);
+	
+	bool write_3dfx_out_file(const char* pFilename, const gpu_image& gi);
+
+	// GPU texture block unpacking
+	void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels);
+	bool unpack_bc1(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha);
+	void unpack_bc4(const void *pBlock_bits, uint8_t *pPixels, uint32_t stride);
+	bool unpack_bc3(const void *pBlock_bits, color_rgba *pPixels);
+	void unpack_bc5(const void *pBlock_bits, color_rgba *pPixels);
+	bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels);
+	bool unpack_bc7(const void* pBlock_bits, color_rgba* pPixels);
+	void unpack_atc(const void* pBlock_bits, color_rgba* pPixels);
+	bool unpack_fxt1(const void* p, color_rgba* pPixels);
+	bool unpack_pvrtc2(const void* p, color_rgba* pPixels);
+	void unpack_etc2_eac_r(const void *p, color_rgba* pPixels, uint32_t c);
+	void unpack_etc2_eac_rg(const void* p, color_rgba* pPixels);
+
+	// unpack_block() is primarily intended to unpack texture data created by the transcoder.
+	// For some texture formats (like ETC2 RGB, PVRTC2, FXT1) it's not a complete implementation.
+	bool unpack_block(texture_format fmt, const void *pBlock, color_rgba *pPixels);
+			
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_kernels_declares.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_kernels_declares.h
@ -0,0 +1,27 @@
+// basisu_kernels_declares.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if BASISU_SUPPORT_SSE
+// Distance kernels: *pDistance is reset to 0 and then accumulates, over n source pixels, the error between
+// pSrc_pixels[i] and pBlock_colors[pSelectors[i]] (perceptual = luma/chroma weighted, linear = squared RGB difference).
+// Accumulation stops early, leaving a partial sum, as soon as the running total is >= early_out_err.
+void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+
+// Selector search kernels: for each of n source pixels, writes to pSelectors[i] the index (0-3) of the lowest-error
+// entry of pBlock_colors and adds that error to *pDistance (reset to 0 first).
+// Stops early once the running total is >= early_out_err; selectors past the stopping point are not written.
+void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+
+// Lowest-error kernels: same search as above but only the summed minimum error is produced (no selectors are stored).
+// Note the early-out test here is strict: accumulation stops once the running total is > early_out_error.
+void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error);
+void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error);
+
+// Builds a 16x16 float matrix in pMatrix16x16 (256 floats, overwritten): for each of the num_vecs entries selected by
+// pVec_indices, v = vector - origin and matrix[j][c] += v[j] * v[c] * weight.
+// pWeighted_vecs points to an array of std::pair<vec16F, uint64_t> (vector, weight); pOrigin points to 16 floats.
+void CPPSPMD_NAME(update_covar_matrix_16x16)(uint32_t num_vecs, const void* pWeighted_vecs, const void *pOrigin, const uint32_t* pVec_indices, void *pMatrix16x16);
+#endif
--- a/engine/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
@ -0,0 +1,647 @@
+// basisu_kernels_imp.h - Do not directly include
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using namespace CPPSPMD;
+
+namespace CPPSPMD_NAME(basisu_kernels_namespace)
+{
+   struct perceptual_distance_rgb_4_N : spmd_kernel
+   {
+      void _call(int64_t* pDistance,
+         const uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+
+         *pDistance = 0;
+
+         __m128i block_colors[4];
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            block_colors[i] = load_rgba32(&pBlock_colors[i]);
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
+
+            vint base_r, base_g, base_b, base_a;
+            if ((s0 == s1) && (s0 == s2) && (s0 == s3))
+            {
+               store_all(base_r, block_colors_r[s0]);
+               store_all(base_g, block_colors_g[s0]);
+               store_all(base_b, block_colors_b[s0]);
+            }
+            else
+            {
+               __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
+               transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
+            }
+
+            vint dr = base_r - r;
+            vint dg = base_g - g;
+            vint db = base_b - b;
+
+            vint delta_l = dr * 27 + dg * 92 + db * 9;
+            vint delta_cr = dr * 128 - delta_l;
+            vint delta_cb = db * 128 - delta_l;
+
+            vint id = ((delta_l * delta_l) >> 7) +
+               ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+               ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+
+            *pDistance += reduce_add(id);
+            if (*pDistance >= early_out_err)
+               return;
+         }
+
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int sel = pSelectors[i];
+            int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+            int dr = base_r - r;
+            int dg = base_g - g;
+            int db = base_b - b;
+
+            int delta_l = dr * 27 + dg * 92 + db * 9;
+            int delta_cr = dr * 128 - delta_l;
+            int delta_cb = db * 128 - delta_l;
+
+            int id = ((delta_l * delta_l) >> 7) +
+               ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+               ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+
+            *pDistance += id;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   // Kernel: sums the squared RGB difference of n source pixels against the block
+   // colors chosen by pSelectors.
+   struct linear_distance_rgb_4_N : spmd_kernel
+   {
+      // Scalar squared RGB distance of one pixel against one block color (tail pixels).
+      static inline int tail_pixel_error(const color_rgba& base_color, const color_rgba& src_pixel)
+      {
+         const int dr = (int)base_color.r - (int)src_pixel.r;
+         const int dg = (int)base_color.g - (int)src_pixel.g;
+         const int db = (int)base_color.b - (int)src_pixel.b;
+
+         return dr * dr + dg * dg + db * db;
+      }
+
+      // pDistance receives the accumulated error (reset to 0 on entry). Returns early,
+      // leaving a partial sum, once the running total is >= early_out_err.
+      void _call(int64_t* pDistance,
+         const uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n,
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+
+         *pDistance = 0;
+
+         // Packed RGBA copies (per-lane gather) and per-channel broadcasts (uniform selector case).
+         __m128i packed_colors[4];
+         vint bcast_r[4], bcast_g[4], bcast_b[4];
+         for (uint32_t k = 0; k < 4; k++)
+         {
+            packed_colors[k] = load_rgba32(&pBlock_colors[k]);
+            store_all(bcast_r[k], (int)pBlock_colors[k].r);
+            store_all(bcast_g[k], (int)pBlock_colors[k].g);
+            store_all(bcast_b[k], (int)pBlock_colors[k].b);
+         }
+
+         uint32_t pix = 0;
+
+         // Vector path: four pixels per iteration.
+         while ((pix + 4) <= n)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[pix + 0]);
+            __m128i c1 = load_rgba32(&pSrc_pixels[pix + 1]);
+            __m128i c2 = load_rgba32(&pSrc_pixels[pix + 2]);
+            __m128i c3 = load_rgba32(&pSrc_pixels[pix + 3]);
+
+            vint src_r, src_g, src_b, src_a;
+            transpose4x4(src_r.m_value, src_g.m_value, src_b.m_value, src_a.m_value, c0, c1, c2, c3);
+
+            const int s0 = pSelectors[pix];
+            const int s1 = pSelectors[pix + 1];
+            const int s2 = pSelectors[pix + 2];
+            const int s3 = pSelectors[pix + 3];
+
+            vint ref_r, ref_g, ref_b, ref_a;
+            const bool mixed_selectors = (s0 != s1) || (s0 != s2) || (s0 != s3);
+            if (mixed_selectors)
+            {
+               // Each lane needs its own block color: gather and transpose.
+               __m128i k0 = packed_colors[s0], k1 = packed_colors[s1], k2 = packed_colors[s2], k3 = packed_colors[s3];
+               transpose4x4(ref_r.m_value, ref_g.m_value, ref_b.m_value, ref_a.m_value, k0, k1, k2, k3);
+            }
+            else
+            {
+               // All four lanes share one block color: reuse the broadcasts.
+               store_all(ref_r, bcast_r[s0]);
+               store_all(ref_g, bcast_g[s0]);
+               store_all(ref_b, bcast_b[s0]);
+            }
+
+            vint dr = ref_r - src_r;
+            vint dg = ref_g - src_g;
+            vint db = ref_b - src_b;
+
+            vint lane_err = dr * dr + dg * dg + db * db;
+
+            *pDistance += reduce_add(lane_err);
+            if (*pDistance >= early_out_err)
+               return;
+
+            pix += 4;
+         }
+
+         // Scalar path for the remaining (n % 4) pixels.
+         while (pix < n)
+         {
+            const int sel = pSelectors[pix];
+
+            *pDistance += tail_pixel_error(pBlock_colors[sel], pSrc_pixels[pix]);
+            if (*pDistance >= early_out_err)
+               return;
+
+            pix++;
+         }
+      }
+   };
+
+   // Kernel: for each of n source pixels, picks the block color (selector 0-3) with the
+   // lowest perceptual error, stores the selector and accumulates the error.
+   struct find_selectors_perceptual_rgb_4_N : spmd_kernel
+   {
+      // Vector perceptual error of four pixels (r, g, b) against one broadcast block color.
+      inline vint compute_dist(
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint diff_r = base_r - r;
+         vint diff_g = base_g - g;
+         vint diff_b = base_b - b;
+
+         vint delta_l = diff_r * 27 + diff_g * 92 + diff_b * 9;
+         vint delta_cr = diff_r * 128 - delta_l;
+         vint delta_cb = diff_b * 128 - delta_l;
+
+         vint luma_term = VINT_SHIFT_RIGHT(delta_l * delta_l, 7);
+         vint cr_term = VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7);
+         vint cb_term = VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
+
+         return luma_term + cr_term + cb_term;
+      }
+
+      // Scalar perceptual error of one pixel against one block color (tail pixels).
+      static inline int tail_pixel_error(const color_rgba& base_color, const color_rgba& src_pixel)
+      {
+         const int dr = (int)base_color.r - (int)src_pixel.r;
+         const int dg = (int)base_color.g - (int)src_pixel.g;
+         const int db = (int)base_color.b - (int)src_pixel.b;
+
+         const int delta_l = dr * 27 + dg * 92 + db * 9;
+         const int delta_cr = dr * 128 - delta_l;
+         const int delta_cb = db * 128 - delta_l;
+
+         return ((delta_l * delta_l) >> 7) +
+            ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+            ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+      }
+
+      // pDistance is reset to 0 and receives the summed minimum error; pSelectors[i] receives the
+      // winning selector of pixel i. Returns early once the running total is >= early_out_err,
+      // in which case selectors past the stopping point are left unwritten.
+      void _call(int64_t* pDistance,
+         uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n,
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+
+         *pDistance = 0;
+
+         // Broadcast each channel of the four block colors across all lanes.
+         vint bcast_r[4], bcast_g[4], bcast_b[4];
+         for (uint32_t k = 0; k < 4; k++)
+         {
+            store_all(bcast_r[k], (int)pBlock_colors[k].r);
+            store_all(bcast_g[k], (int)pBlock_colors[k].g);
+            store_all(bcast_b[k], (int)pBlock_colors[k].b);
+         }
+
+         // Byte shuffle that packs the low byte of each 32-bit lane into the low four bytes.
+         const __m128i pack_low_bytes = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
+
+         uint32_t pix = 0;
+
+         // Vector path: four pixels per iteration.
+         while ((pix + 4) <= n)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[pix + 0]);
+            __m128i c1 = load_rgba32(&pSrc_pixels[pix + 1]);
+            __m128i c2 = load_rgba32(&pSrc_pixels[pix + 2]);
+            __m128i c3 = load_rgba32(&pSrc_pixels[pix + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(bcast_r[0], bcast_g[0], bcast_b[0], r, g, b);
+            vint dist1 = compute_dist(bcast_r[1], bcast_g[1], bcast_b[1], r, g, b);
+            vint dist2 = compute_dist(bcast_r[2], bcast_g[2], bcast_b[2], r, g, b);
+            vint dist3 = compute_dist(bcast_r[3], bcast_g[3], bcast_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            // Lowest-index selector whose distance equals the minimum (ties go to the smaller index).
+            vint sel_2_or_3 = spmd_ternaryi(min_dist == dist2, 2, 3);
+            vint sel_1_to_3 = spmd_ternaryi(min_dist == dist1, 1, sel_2_or_3);
+            vint sels = spmd_ternaryi(min_dist == dist0, 0, sel_1_to_3);
+
+            // Store the four selectors as four consecutive bytes.
+            __m128i vsels = shuffle_epi8(sels.m_value, pack_low_bytes);
+            storeu_si32((void *)(pSelectors + pix), vsels);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance >= early_out_err)
+               return;
+
+            pix += 4;
+         }
+
+         // Scalar path for the remaining (n % 4) pixels.
+         while (pix < n)
+         {
+            int best_err = INT_MAX;
+            int best_sel = 0;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               const int err = tail_pixel_error(pBlock_colors[sel], pSrc_pixels[pix]);
+               if (err < best_err)
+               {
+                  best_err = err;
+                  best_sel = sel;
+               }
+            }
+
+            pSelectors[pix] = (uint8_t)best_sel;
+
+            *pDistance += best_err;
+            if (*pDistance >= early_out_err)
+               return;
+
+            pix++;
+         }
+      }
+   };
+
+   // Kernel: for each of n source pixels, picks the block color (selector 0-3) with the
+   // lowest squared RGB distance, stores the selector and accumulates the distance.
+   struct find_selectors_linear_rgb_4_N : spmd_kernel
+   {
+      // Vector squared RGB distance of four pixels (r, g, b) against one broadcast block color.
+      inline vint compute_dist(
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint diff_r = base_r - r;
+         vint diff_g = base_g - g;
+         vint diff_b = base_b - b;
+
+         return diff_r * diff_r + diff_g * diff_g + diff_b * diff_b;
+      }
+
+      // Scalar squared RGB distance of one pixel against one block color (tail pixels).
+      static inline int tail_pixel_error(const color_rgba& base_color, const color_rgba& src_pixel)
+      {
+         const int dr = (int)base_color.r - (int)src_pixel.r;
+         const int dg = (int)base_color.g - (int)src_pixel.g;
+         const int db = (int)base_color.b - (int)src_pixel.b;
+
+         return dr * dr + dg * dg + db * db;
+      }
+
+      // pDistance is reset to 0 and receives the summed minimum error; pSelectors[i] receives the
+      // winning selector of pixel i. Returns early once the running total is >= early_out_err,
+      // in which case selectors past the stopping point are left unwritten.
+      void _call(int64_t* pDistance,
+         uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n,
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+
+         *pDistance = 0;
+
+         // Broadcast each channel of the four block colors across all lanes.
+         vint bcast_r[4], bcast_g[4], bcast_b[4];
+         for (uint32_t k = 0; k < 4; k++)
+         {
+            store_all(bcast_r[k], (int)pBlock_colors[k].r);
+            store_all(bcast_g[k], (int)pBlock_colors[k].g);
+            store_all(bcast_b[k], (int)pBlock_colors[k].b);
+         }
+
+         // Byte shuffle that packs the low byte of each 32-bit lane into the low four bytes.
+         const __m128i pack_low_bytes = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
+
+         uint32_t pix = 0;
+
+         // Vector path: four pixels per iteration.
+         while ((pix + 4) <= n)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[pix + 0]);
+            __m128i c1 = load_rgba32(&pSrc_pixels[pix + 1]);
+            __m128i c2 = load_rgba32(&pSrc_pixels[pix + 2]);
+            __m128i c3 = load_rgba32(&pSrc_pixels[pix + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(bcast_r[0], bcast_g[0], bcast_b[0], r, g, b);
+            vint dist1 = compute_dist(bcast_r[1], bcast_g[1], bcast_b[1], r, g, b);
+            vint dist2 = compute_dist(bcast_r[2], bcast_g[2], bcast_b[2], r, g, b);
+            vint dist3 = compute_dist(bcast_r[3], bcast_g[3], bcast_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            // Lowest-index selector whose distance equals the minimum (ties go to the smaller index).
+            vint sel_2_or_3 = spmd_ternaryi(min_dist == dist2, 2, 3);
+            vint sel_1_to_3 = spmd_ternaryi(min_dist == dist1, 1, sel_2_or_3);
+            vint sels = spmd_ternaryi(min_dist == dist0, 0, sel_1_to_3);
+
+            // Store the four selectors as four consecutive bytes.
+            __m128i vsels = shuffle_epi8(sels.m_value, pack_low_bytes);
+            storeu_si32((void *)(pSelectors + pix), vsels);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance >= early_out_err)
+               return;
+
+            pix += 4;
+         }
+
+         // Scalar path for the remaining (n % 4) pixels.
+         while (pix < n)
+         {
+            int best_err = INT_MAX;
+            int best_sel = 0;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               const int err = tail_pixel_error(pBlock_colors[sel], pSrc_pixels[pix]);
+               if (err < best_err)
+               {
+                  best_err = err;
+                  best_sel = sel;
+               }
+            }
+
+            pSelectors[pix] = (uint8_t)best_sel;
+
+            *pDistance += best_err;
+            if (*pDistance >= early_out_err)
+               return;
+
+            pix++;
+         }
+      }
+   };
+
+   struct find_lowest_error_perceptual_rgb_4_N : spmd_kernel
+   {
+      inline vint compute_dist(
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+
+         vint delta_l = dr * 27 + dg * 92 + db * 9;
+         vint delta_cr = dr * 128 - delta_l;
+         vint delta_cb = db * 128 - delta_l;
+
+         vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_error)
+      {
+         assert(early_out_error >= 0);
+
+         *pDistance = 0;
+
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance > early_out_error)
+               return;
+         }
+
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int delta_l = dr * 27 + dg * 92 + db * 9;
+               int delta_cr = dr * 128 - delta_l;
+               int delta_cb = db * 128 - delta_l;
+
+               int id = ((delta_l * delta_l) >> 7) +
+                  ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+                  ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+               
+               if (id < best_err)
+               {
+                  best_err = id;
+               }
+            }
+
+            *pDistance += best_err;
+            if (*pDistance > early_out_error)
+               return;
+         }
+      }
+   };
+
+   // Kernel: sums, over n source pixels, the lowest squared RGB distance achievable with
+   // any of the four block colors. No selectors are produced.
+   struct find_lowest_error_linear_rgb_4_N : spmd_kernel
+   {
+      // Vector squared RGB distance of four pixels (r, g, b) against one broadcast block color.
+      inline vint compute_dist(
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint diff_r = base_r - r;
+         vint diff_g = base_g - g;
+         vint diff_b = base_b - b;
+
+         return diff_r * diff_r + diff_g * diff_g + diff_b * diff_b;
+      }
+
+      // Scalar squared RGB distance of one pixel against one block color (tail pixels).
+      static inline int tail_pixel_error(const color_rgba& base_color, const color_rgba& src_pixel)
+      {
+         const int dr = (int)base_color.r - (int)src_pixel.r;
+         const int dg = (int)base_color.g - (int)src_pixel.g;
+         const int db = (int)base_color.b - (int)src_pixel.b;
+
+         return dr * dr + dg * dg + db * db;
+      }
+
+      // pDistance is reset to 0 and receives the summed minimum error. Returns early,
+      // leaving a partial sum, once the running total is strictly greater than early_out_error.
+      void _call(int64_t* pDistance,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n,
+         int64_t early_out_error)
+      {
+         assert(early_out_error >= 0);
+
+         *pDistance = 0;
+
+         // Broadcast each channel of the four block colors across all lanes.
+         vint bcast_r[4], bcast_g[4], bcast_b[4];
+         for (uint32_t k = 0; k < 4; k++)
+         {
+            store_all(bcast_r[k], (int)pBlock_colors[k].r);
+            store_all(bcast_g[k], (int)pBlock_colors[k].g);
+            store_all(bcast_b[k], (int)pBlock_colors[k].b);
+         }
+
+         uint32_t pix = 0;
+
+         // Vector path: four pixels per iteration.
+         while ((pix + 4) <= n)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[pix + 0]);
+            __m128i c1 = load_rgba32(&pSrc_pixels[pix + 1]);
+            __m128i c2 = load_rgba32(&pSrc_pixels[pix + 2]);
+            __m128i c3 = load_rgba32(&pSrc_pixels[pix + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(bcast_r[0], bcast_g[0], bcast_b[0], r, g, b);
+            vint dist1 = compute_dist(bcast_r[1], bcast_g[1], bcast_b[1], r, g, b);
+            vint dist2 = compute_dist(bcast_r[2], bcast_g[2], bcast_b[2], r, g, b);
+            vint dist3 = compute_dist(bcast_r[3], bcast_g[3], bcast_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance > early_out_error)
+               return;
+
+            pix += 4;
+         }
+
+         // Scalar path for the remaining (n % 4) pixels.
+         while (pix < n)
+         {
+            int best_err = INT_MAX;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               const int err = tail_pixel_error(pBlock_colors[sel], pSrc_pixels[pix]);
+               if (err < best_err)
+                  best_err = err;
+            }
+
+            *pDistance += best_err;
+            if (*pDistance > early_out_error)
+               return;
+
+            pix++;
+         }
+      }
+   };
+
+   // Kernel: accumulates a weighted 16x16 float matrix from a set of indexed 16-float vectors.
+   // For every selected vector, v = vector - origin and matrix[j][c] += v[j] * (v[c] * weight).
+   struct update_covar_matrix_16x16 : spmd_kernel
+   {
+      // pWeighted_vecs_void: array of std::pair<vec16F, uint64_t> (vector, weight).
+      // pOrigin_void: 16 floats subtracted from every vector.
+      // pVec_indices: num_vecs indices into the weighted vector array.
+      // pMatrix16x16_void: 256 floats, overwritten with the result in row-major order.
+      void _call(
+         uint32_t num_vecs, const void* pWeighted_vecs_void, const void* pOrigin_void, const uint32_t* pVec_indices, void* pMatrix16x16_void)
+      {
+         typedef std::pair<vec16F, uint64_t> weighted_vec;
+         const weighted_vec* pWeighted_vecs = static_cast<const weighted_vec*>(pWeighted_vecs_void);
+
+         const float* pOrigin = static_cast<const float*>(pOrigin_void);
+
+         // The origin, loaded as four groups of four floats.
+         vfloat org0 = loadu_linear_all(pOrigin);
+         vfloat org1 = loadu_linear_all(pOrigin + 4);
+         vfloat org2 = loadu_linear_all(pOrigin + 8);
+         vfloat org3 = loadu_linear_all(pOrigin + 12);
+
+         // Accumulator: 16 rows, each held as four groups of four floats, started at zero.
+         vfloat accum[16][4];
+         vfloat vzero(zero_vfloat());
+
+         for (uint32_t row = 0; row < 16; row++)
+         {
+            for (uint32_t grp = 0; grp < 4; grp++)
+               store_all(accum[row][grp], vzero);
+         }
+
+         for (uint32_t k = 0; k < num_vecs; k++)
+         {
+            const weighted_vec& entry = pWeighted_vecs[pVec_indices[k]];
+
+            const float* pW = entry.first.get_ptr();
+            vfloat weight((float)entry.second);
+
+            // Vector relative to the origin, and the same scaled by its weight.
+            vfloat centered[4] = { loadu_linear_all(pW) - org0, loadu_linear_all(pW + 4) - org1, loadu_linear_all(pW + 8) - org2, loadu_linear_all(pW + 12) - org3 };
+
+            vfloat scaled[4] = { centered[0] * weight, centered[1] * weight, centered[2] * weight, centered[3] * weight };
+
+            // View the centered vector as 16 scalars so each component can be broadcast in turn.
+            const float* pCentered_scalars = (const float*)centered;
+
+            for (uint32_t row = 0; row < 16; row++)
+            {
+               vfloat vx = pCentered_scalars[row];
+
+               for (uint32_t grp = 0; grp < 4; grp++)
+                  store_all(accum[row][grp], accum[row][grp] + vx * scaled[grp]);
+
+            } // row
+
+         } // k
+
+         // Write the 16 rows out contiguously, 16 floats per row.
+         float* pDst = static_cast<float*>(pMatrix16x16_void);
+         for (uint32_t row = 0; row < 16; row++, pDst += 16)
+         {
+            for (uint32_t grp = 0; grp < 4; grp++)
+               storeu_linear_all(pDst + grp * 4, accum[row][grp]);
+         }
+      }
+   };
+
+} // namespace
+
+using namespace CPPSPMD_NAME(basisu_kernels_namespace);
+
+// Free-function entry point: forwards its arguments to the perceptual_distance_rgb_4_N kernel via spmd_call.
+void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(
+   int64_t* pDistance,
+   const uint8_t* pSelectors,
+   const color_rgba* pBlock_colors,
+   const color_rgba* pSrc_pixels,
+   uint32_t n,
+   int64_t early_out_err)
+{
+   spmd_call<perceptual_distance_rgb_4_N>(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+// Free-function entry point: forwards its arguments to the linear_distance_rgb_4_N kernel via spmd_call.
+void CPPSPMD_NAME(linear_distance_rgb_4_N)(
+   int64_t* pDistance,
+   const uint8_t* pSelectors,
+   const color_rgba* pBlock_colors,
+   const color_rgba* pSrc_pixels,
+   uint32_t n,
+   int64_t early_out_err)
+{
+   spmd_call<linear_distance_rgb_4_N>(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+// Free-function entry point: forwards its arguments to the find_selectors_perceptual_rgb_4_N kernel via spmd_call.
+void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(
+   int64_t *pDistance,
+   uint8_t* pSelectors,
+   const color_rgba* pBlock_colors,
+   const color_rgba* pSrc_pixels,
+   uint32_t n,
+   int64_t early_out_err)
+{
+   spmd_call<find_selectors_perceptual_rgb_4_N>(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+// Free-function entry point: forwards its arguments to the find_selectors_linear_rgb_4_N kernel via spmd_call.
+void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(
+   int64_t* pDistance,
+   uint8_t* pSelectors,
+   const color_rgba* pBlock_colors,
+   const color_rgba* pSrc_pixels,
+   uint32_t n,
+   int64_t early_out_err)
+{
+   spmd_call<find_selectors_linear_rgb_4_N>(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+// Free-function entry point: forwards its arguments to the find_lowest_error_perceptual_rgb_4_N kernel via spmd_call.
+void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(
+   int64_t* pDistance,
+   const color_rgba* pBlock_colors,
+   const color_rgba* pSrc_pixels,
+   uint32_t n,
+   int64_t early_out_error)
+{
+   spmd_call<find_lowest_error_perceptual_rgb_4_N>(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
+}
+
+// Free-function entry point: forwards its arguments to the find_lowest_error_linear_rgb_4_N kernel via spmd_call.
+void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(
+   int64_t* pDistance,
+   const color_rgba* pBlock_colors,
+   const color_rgba* pSrc_pixels,
+   uint32_t n,
+   int64_t early_out_error)
+{
+   spmd_call<find_lowest_error_linear_rgb_4_N>(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
+}
+
+// Free-function entry point: forwards its arguments to the update_covar_matrix_16x16 kernel via spmd_call.
+void CPPSPMD_NAME(update_covar_matrix_16x16)(
+   uint32_t num_vecs,
+   const void* pWeighted_vecs,
+   const void* pOrigin,
+   const uint32_t *pVec_indices,
+   void* pMatrix16x16)
+{
+   spmd_call<update_covar_matrix_16x16>(num_vecs, pWeighted_vecs, pOrigin, pVec_indices, pMatrix16x16);
+}
--- a/engine/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp
@ -0,0 +1,161 @@
+// basisu_kernels_sse.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_enc.h"
+
+#if BASISU_SUPPORT_SSE
+
+#define CPPSPMD_SSE2 0
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#if !defined(_MSC_VER)
+	#if __AVX__ || __AVX2__ || __AVX512F__
+		#error Please check your compiler options
+	#endif
+	
+	#if CPPSPMD_SSE2
+		#if __SSE4_1__ || __SSE3__ || __SSE4_2__ || __SSSE3__
+			#error SSE4.1/SSE3/SSE4.2/SSSE3 cannot be enabled to use this file
+		#endif
+	#else
+		#if !__SSE4_1__ || !__SSE3__ || !__SSSE3__
+			#error Please check your compiler options
+		#endif
+	#endif
+#endif
+
+#include "cppspmd_sse.h"
+
+#include "cppspmd_type_aliases.h"
+
+using namespace basisu;
+
+#include "basisu_kernels_declares.h"
+#include "basisu_kernels_imp.h"
+
+namespace basisu
+{
+
+// Set of x86 feature flags filled in from CPUID results. Every flag starts out false.
+struct cpu_info
+{
+	cpu_info() :
+		m_has_fpu(false),
+		m_has_mmx(false),
+		m_has_sse(false),
+		m_has_sse2(false),
+		m_has_sse3(false),
+		m_has_ssse3(false),
+		m_has_sse41(false),
+		m_has_sse42(false),
+		m_has_avx(false),
+		m_has_avx2(false),
+		m_has_pclmulqdq(false)
+	{
+	}
+
+	bool m_has_fpu;
+	bool m_has_mmx;
+	bool m_has_sse;
+	bool m_has_sse2;
+	bool m_has_sse3;
+	bool m_has_ssse3;
+	bool m_has_sse41;
+	bool m_has_sse42;
+	bool m_has_avx;
+	bool m_has_avx2;
+	bool m_has_pclmulqdq;
+};
+
+// Decodes the two feature words passed in (ECX and EDX as returned by CPUID) into the matching
+// cpu_info flags. Only the flags listed here are written; m_has_avx2 is left untouched.
+static void extract_x86_flags(cpu_info &info, uint32_t ecx, uint32_t edx)
+{
+	// Flags carried in EDX.
+	info.m_has_fpu = ((edx >> 0) & 1U) != 0;
+	info.m_has_mmx = ((edx >> 23) & 1U) != 0;
+	info.m_has_sse = ((edx >> 25) & 1U) != 0;
+	info.m_has_sse2 = ((edx >> 26) & 1U) != 0;
+
+	// Flags carried in ECX.
+	info.m_has_sse3 = ((ecx >> 0) & 1U) != 0;
+	info.m_has_pclmulqdq = ((ecx >> 1) & 1U) != 0;
+	info.m_has_ssse3 = ((ecx >> 9) & 1U) != 0;
+	info.m_has_sse41 = ((ecx >> 19) & 1U) != 0;
+	info.m_has_sse42 = ((ecx >> 20) & 1U) != 0;
+	info.m_has_avx = ((ecx >> 28) & 1U) != 0;
+}
+
+// Sets info.m_has_avx2 from bit 5 of the EBX word passed in; no other flag is touched.
+static void extract_x86_extended_flags(cpu_info &info, uint32_t ebx)
+{
+	const uint32_t avx2_mask = 1U << 5;
+	info.m_has_avx2 = (ebx & avx2_mask) != 0;
+}
+
+#ifndef _MSC_VER
+// Executes the CPUID instruction (non-MSVC builds only) with the given EAX and ECX inputs and
+// stores the resulting EAX, EBX, ECX and EDX values in regs[0], regs[1], regs[2] and regs[3].
+// regs must point to at least four uint32_t values.
+static void do_cpuid(uint32_t eax, uint32_t ecx, uint32_t* regs)
+{
+	uint32_t ebx = 0, edx = 0;
+
+#if defined(__PIC__) && defined(__i386__)
+	// 32-bit PIC build: EBX is copied to EDI before CPUID and swapped back afterwards, so EBX ends
+	// up with its original value and the CPUID result for EBX is returned through EDI ("=D").
+	// NOTE(review): presumably needed because EBX is reserved for the GOT pointer in i386 PIC code - confirm.
+	__asm__("movl %%ebx, %%edi;"
+		"cpuid;"
+		"xchgl %%ebx, %%edi;"
+		: "=D"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx));
+#else
+	// All other builds: EBX can be used directly as an in/out operand.
+	__asm__("cpuid;" : "+b"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx));
+#endif
+
+	regs[0] = eax; regs[1] = ebx; regs[2] = ecx; regs[3] = edx;
+}
+#endif
+
+// Fills info from CPUID. Leaf 0 is queried first to learn the highest supported leaf; leaf 1
+// (ECX/EDX feature words) and leaf 7, sub-leaf 0 (EBX feature word) are decoded only when the
+// processor reports them as available. Flags belonging to an unavailable leaf are left as they were.
+static void get_cpuinfo(cpu_info &info)
+{
+	int cpuid_regs[4];
+
+	// Leaf 0: EAX holds the highest supported leaf number.
+#ifdef _MSC_VER
+	__cpuid(cpuid_regs, 0);
+#else
+	do_cpuid(0, 0, (uint32_t *)cpuid_regs);
+#endif
+
+	const uint32_t highest_leaf = cpuid_regs[0];
+	if (highest_leaf < 1U)
+		return;
+
+	// Leaf 1: feature flags in ECX (index 2) and EDX (index 3).
+#ifdef _MSC_VER
+	__cpuid(cpuid_regs, 1);
+#else
+	do_cpuid(1, 0, (uint32_t*)cpuid_regs);
+#endif
+	extract_x86_flags(info, cpuid_regs[2], cpuid_regs[3]);
+
+	if (highest_leaf < 7U)
+		return;
+
+	// Leaf 7, sub-leaf 0: extended feature flags in EBX (index 1).
+#ifdef _MSC_VER
+	__cpuidex(cpuid_regs, 7, 0);
+#else
+	do_cpuid(7, 0, (uint32_t*)cpuid_regs);
+#endif
+	extract_x86_extended_flags(info, cpuid_regs[1]);
+}
+
+// Queries the CPU's feature flags and stores in g_cpu_supports_sse41 whether SSE, SSE2, SSE3,
+// SSSE3 and SSE4.1 are all reported.
+void detect_sse41()
+{
+	cpu_info cpu;
+	get_cpuinfo(cpu);
+
+	// SSE4.1 is only treated as usable when every earlier SSE level is reported as well.
+	const bool has_sse_and_sse2 = cpu.m_has_sse && cpu.m_has_sse2;
+	const bool has_sse3_and_ssse3 = cpu.m_has_sse3 && cpu.m_has_ssse3;
+
+	g_cpu_supports_sse41 = has_sse_and_sse2 && has_sse3_and_ssse3 && cpu.m_has_sse41;
+}
+
+} // namespace basisu
+#else // #if BASISU_SUPPORT_SSE
+namespace basisu
+{
+
+// Stub compiled when BASISU_SUPPORT_SSE is disabled: performs no CPU detection and modifies nothing.
+void detect_sse41()
+{
+}
+
+} // namespace basisu
+#endif // #if BASISU_SUPPORT_SSE
+
--- a/engine/thirdparty/basis_universal/encoder/basisu_miniz.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_miniz.h
--- a/engine/thirdparty/basis_universal/encoder/basisu_ocl_kernels.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_ocl_kernels.h
--- a/engine/thirdparty/basis_universal/encoder/basisu_opencl.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_opencl.cpp
--- a/engine/thirdparty/basis_universal/encoder/basisu_opencl.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_opencl.h
@ -0,0 +1,143 @@
+// basisu_opencl.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Note: Undefine or set BASISU_SUPPORT_OPENCL to 0 to completely disable OpenCL support.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "../transcoder/basisu.h"
+#include "basisu_enc.h"
+#include "basisu_etc.h"
+
+namespace basisu
+{
+	bool opencl_init(bool force_serialization);
+	void opencl_deinit();
+	bool opencl_is_available();
+
+	struct opencl_context;
+
+	// Each thread calling OpenCL should have its own opencl_context_ptr. This corresponds to an OpenCL command queue. (Confusingly, we only use a single OpenCL device "context".)
+	typedef opencl_context* opencl_context_ptr;
+
+	opencl_context_ptr opencl_create_context();
+	void opencl_destroy_context(opencl_context_ptr context);
+
+#pragma pack(push, 1)
+	// One block of 16 pixels, stored row-major as m_pixels[y*4+x]. Declared under pack(1), so the
+	// compiler adds no padding to the struct.
+	// NOTE(review): presumably the layout must match the pixel-block type used by the OpenCL kernels - confirm.
+	struct cl_pixel_block
+	{
+		color_rgba m_pixels[16]; // [y*4+x]
+	};
+#pragma pack(pop)
+
+	// Must match BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE
+	const uint32_t OPENCL_ENCODE_ETC1S_MAX_PERMS = 165;
+
+	bool opencl_set_pixel_blocks(opencl_context_ptr pContext, uint32_t total_blocks, const cl_pixel_block* pPixel_blocks);
+
+	bool opencl_encode_etc1s_blocks(opencl_context_ptr pContext, etc_block* pOutput_blocks, bool perceptual, uint32_t total_perms);
+
+	// opencl_encode_etc1s_pixel_clusters
+
+#pragma pack(push, 1)
+	// Describes one pixel cluster as a pixel count plus the index of its first pixel; both fields are
+	// 64-bit and the struct is declared under pack(1).
+	// NOTE(review): m_first_pixel_index presumably indexes the pPixels array passed to
+	// opencl_encode_etc1s_pixel_clusters() - confirm against the implementation.
+	struct cl_pixel_cluster
+	{
+		uint64_t m_total_pixels;
+		uint64_t m_first_pixel_index;
+	};
+#pragma pack(pop)
+
+	bool opencl_encode_etc1s_pixel_clusters(
+		opencl_context_ptr pContext,
+		etc_block* pOutput_blocks, 
+		uint32_t total_clusters,
+		const cl_pixel_cluster *pClusters,
+		uint64_t total_pixels,
+		const color_rgba *pPixels,
+		const uint32_t *pPixel_weights,
+		bool perceptual, uint32_t total_perms);
+
+	// opencl_refine_endpoint_clusterization
+
+#pragma pack(push, 1)
+	// Per-block input record for opencl_refine_endpoint_clusterization(). Declared under pack(1), so the
+	// three 16-bit fields and the trailing 8-bit field are laid out back to back with no padding.
+	// NOTE(review): m_first_cluster_ofs/m_num_clusters presumably select a range of candidate clusters
+	// for the block - confirm against the implementation.
+	struct cl_block_info_struct
+	{
+		uint16_t m_first_cluster_ofs;
+		uint16_t m_num_clusters;
+		uint16_t m_cur_cluster_index;
+		uint8_t m_cur_cluster_etc_inten;
+	};
+
+	// Per-cluster input record for opencl_refine_endpoint_clusterization(). Also declared under pack(1),
+	// so the 16-bit m_cluster_index directly follows the 8-bit m_etc_inten without alignment padding.
+	struct cl_endpoint_cluster_struct
+	{
+		color_rgba m_unscaled_color;
+		uint8_t m_etc_inten;
+		uint16_t m_cluster_index;
+	};
+#pragma pack(pop)
+
+	bool opencl_refine_endpoint_clusterization(
+		opencl_context_ptr pContext,
+		const cl_block_info_struct *pPixel_block_info,
+		uint32_t total_clusters,
+		const cl_endpoint_cluster_struct *pCluster_info,
+		const uint32_t *pSorted_block_indices,
+		uint32_t* pOutput_cluster_indices, 
+		bool perceptual);
+
+	// opencl_find_optimal_selector_clusters_for_each_block
+
+#pragma pack(push, 1)
+	// Input records for opencl_find_optimal_selector_clusters_for_each_block(); all three structs are
+	// declared under pack(1).
+
+	// One candidate selector set, packed into a single 32-bit word.
+	struct fosc_selector_struct
+	{
+		uint32_t m_packed_selectors;	// 4x4 grid of 2-bit selectors
+	};
+
+	// One input block: its color/intensity plus the range of selector table entries to test against it.
+	struct fosc_block_struct
+	{
+		color_rgba m_etc_color5_inten;  // unscaled 5-bit block color in RGB, alpha has block's intensity index
+		uint32_t m_first_selector;		// offset into selector table
+		uint32_t m_num_selectors;		// number of selectors to check
+	};
+
+	// Scalar parameters: block count and a perceptual flag stored as an int.
+	// NOTE(review): presumably passed to the kernel as its parameter block - confirm.
+	struct fosc_param_struct
+	{
+		uint32_t m_total_blocks;
+		int m_perceptual;
+	};
+#pragma pack(pop)
+
+	bool opencl_find_optimal_selector_clusters_for_each_block(
+		opencl_context_ptr pContext,
+		const fosc_block_struct* pInput_block_info,	// one per block
+		uint32_t total_input_selectors,
+		const fosc_selector_struct* pInput_selectors,
+		const uint32_t* pSelector_cluster_indices,
+		uint32_t* pOutput_selector_cluster_indices, // one per block
+		bool perceptual);
+
+#pragma pack(push, 1)
+	// Scalar parameters (block count and a perceptual flag stored as an int), declared under pack(1).
+	// Field-for-field identical to fosc_param_struct.
+	// NOTE(review): presumably the parameter block for opencl_determine_selectors() - confirm.
+	struct ds_param_struct
+	{
+		uint32_t m_total_blocks;
+		int m_perceptual;
+	};
+#pragma pack(pop)
+
+	bool opencl_determine_selectors(
+		opencl_context_ptr pContext,
+		const color_rgba* pInput_etc_color5_and_inten,
+		etc_block* pOutput_blocks,
+		bool perceptual);
+
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp
@ -0,0 +1,564 @@
+// basisu_pvrtc1_4.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_pvrtc1_4.h"
+
+namespace basisu
+{
+#if 0
+	static const uint8_t g_pvrtc_5[32] = { 0,8,16,24,33,41,49,57,66,74,82,90,99,107,115,123,132,140,148,156,165,173,181,189,198,206,214,222,231,239,247,255 };
+	static const uint8_t g_pvrtc_4[16] = { 0,16,33,49,66,82,99,115,140,156,173,189,206,222,239,255 };
+	static const uint8_t g_pvrtc_3[8] = { 0,33,74,107,148,181,222,255 };
+	static const uint8_t g_pvrtc_alpha[9] = { 0,34,68,102,136,170,204,238,255 };
+#endif
+
+	static const uint8_t g_pvrtc_5_nearest[256] = { 0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,20,20,20,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23,24,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31,31,31,31 };
+	static const uint8_t g_pvrtc_4_nearest[256] = { 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15 };
+#if 0
+	static const uint8_t g_pvrtc_3_nearest[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 };
+	static const uint8_t g_pvrtc_alpha_nearest[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8 };
+#endif
+
+#if 0
+	static const uint8_t g_pvrtc_5_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,
+		3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,
+		7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,
+		11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,
+		15,15,15,15,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,
+		19,19,19,19,19,20,20,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,
+		23,23,23,23,23,23,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,
+		27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31
+	};
+
+	static const uint8_t g_pvrtc_5_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,
+		4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,
+		8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,
+		12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,16,16,16,16,
+		16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,20,20,20,
+		20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23,24,24,
+		24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,28,
+		28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31,31,31,31,31,31,31,31
+	};
+
+	static const uint8_t g_pvrtc_4_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,
+		9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,
+		11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,
+		13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15
+	};
+
+	static const uint8_t g_pvrtc_4_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,
+		8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,
+		10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,
+		12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,
+		14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
+	};
+
+	static const uint8_t g_pvrtc_3_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7
+	};
+
+	static const uint8_t g_pvrtc_3_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+	};
+
+	static const uint8_t g_pvrtc_alpha_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8
+	};
+
+	static const uint8_t g_pvrtc_alpha_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+	};
+#endif
+
+	// Converts coordinates (x, y) into a swizzled index. The low bits of y and x are interleaved
+	// (y bits in the even positions, x bits in the odd positions) for as many bits as the smaller of
+	// width/height spans; the remaining high bits of the coordinate along the larger dimension are
+	// placed above the interleaved bits. width and height must be powers of two, x < width, y < height.
+	uint32_t pvrtc4_swizzle_uv(uint32_t width, uint32_t height, uint32_t x, uint32_t y)
+	{
+		assert((x < width) && (y < height) && basisu::is_pow2(height) && basisu::is_pow2(width));
+
+		const bool is_wide = (height < width);
+		const uint32_t smaller_dim = is_wide ? height : width;
+		const uint32_t larger_coord = is_wide ? x : y;
+
+		uint32_t result = 0;
+		uint32_t num_interleaved_bits = 0;
+
+		// Walk one source bit at a time; each source bit position maps to a pair of destination bits.
+		uint32_t src_mask = 1, dst_mask = 1;
+		while (src_mask < smaller_dim)
+		{
+			if (y & src_mask)
+				result |= dst_mask;
+			if (x & src_mask)
+				result |= (dst_mask << 1);
+
+			src_mask <<= 1;
+			dst_mask <<= 2;
+			++num_interleaved_bits;
+		}
+
+		// Append whatever is left of the larger dimension's coordinate above the interleaved bits.
+		const uint32_t leftover = larger_coord >> num_interleaved_bits;
+		result |= (leftover << (2 * num_interleaved_bits));
+
+		return result;
+	}
+
+	// Decodes one of the block's two endpoints from its 16-bit field in m_endpoints
+	// (endpoint_index 0 = low 16 bits, 1 = high 16 bits).
+	// Bit 15 of the field selects the format: set = opaque, clear = translucent.
+	// When unpack is false the raw field values are returned (alpha is 7 for opaque endpoints);
+	// when unpack is true every component is expanded to 8 bits by bit replication.
+	color_rgba pvrtc4_block::get_endpoint(uint32_t endpoint_index, bool unpack) const
+	{
+		assert(endpoint_index < 2);
+
+		const uint32_t bits = m_endpoints >> (endpoint_index * 16);
+		const bool is_first = (endpoint_index == 0);
+		const bool is_opaque = (bits & 0x8000) != 0;
+
+		uint32_t r, g, b, a;
+		if (is_opaque)
+		{
+			// Opaque: 5-bit red/green; blue is 4 bits for endpoint 0 and 5 bits for endpoint 1.
+			r = (bits >> 10) & 31;
+			g = (bits >> 5) & 31;
+
+			if (is_first)
+			{
+				b = (bits >> 1) & 15;
+				if (unpack)
+					b = (b << 1) | (b >> 3);
+			}
+			else
+			{
+				b = bits & 31;
+			}
+
+			a = unpack ? 255 : 7;
+		}
+		else
+		{
+			// Translucent: 3-bit alpha, 4-bit red/green; blue is 3 bits for endpoint 0 and 4 bits for endpoint 1.
+			a = (bits >> 12) & 7;
+			r = (bits >> 8) & 15;
+			g = (bits >> 4) & 15;
+			b = is_first ? ((bits >> 1) & 7) : (bits & 15);
+
+			if (unpack)
+			{
+				// Alpha: 3 -> 4 bits (low bit zero), then duplicate the nibble to fill 8 bits.
+				a <<= 1;
+				a |= (a << 4);
+
+				// Colors: widen to 5 bits by replicating the top bit(s).
+				r = (r << 1) | (r >> 3);
+				g = (g << 1) | (g >> 3);
+				b = is_first ? ((b << 2) | (b >> 1)) : ((b << 1) | (b >> 3));
+			}
+		}
+
+		if (unpack)
+		{
+			// All colors are 5 bits wide at this point; expand them to 8 bits.
+			r = (r << 3) | (r >> 2);
+			g = (g << 3) | (g >> 2);
+			b = (b << 3) | (b >> 2);
+		}
+
+		assert((r < 256) && (g < 256) && (b < 256) && (a < 256));
+
+		return color_rgba(r, g, b, a);
+	}
+
+	// Decodes one of the block's two endpoints (0 = low 16 bits of m_endpoints, 1 = high 16 bits) and
+	// returns it with 5-bit red/green/blue and 4-bit alpha. Narrower stored fields are widened by
+	// bit replication; opaque endpoints (bit 15 set) always get alpha 15.
+	color_rgba pvrtc4_block::get_endpoint_5554(uint32_t endpoint_index) const
+	{
+		assert(endpoint_index < 2);
+
+		const uint32_t bits = m_endpoints >> (endpoint_index * 16);
+		const bool is_first = (endpoint_index == 0);
+
+		uint32_t r, g, b, a;
+		if ((bits & 0x8000) == 0)
+		{
+			// Translucent 4433 (endpoint 0) or 4443 (endpoint 1): alpha 3 -> 4 bits, red/green 4 -> 5 bits.
+			a = ((bits >> 12) & 7) << 1;
+
+			r = (bits >> 8) & 15;
+			r = (r << 1) | (r >> 3);
+
+			g = (bits >> 4) & 15;
+			g = (g << 1) | (g >> 3);
+
+			if (is_first)
+			{
+				// 3-bit blue -> 5 bits
+				b = (bits >> 1) & 7;
+				b = (b << 2) | (b >> 1);
+			}
+			else
+			{
+				// 4-bit blue -> 5 bits
+				b = bits & 15;
+				b = (b << 1) | (b >> 3);
+			}
+		}
+		else
+		{
+			// Opaque 554 (endpoint 0) or 555 (endpoint 1).
+			r = (bits >> 10) & 31;
+			g = (bits >> 5) & 31;
+
+			if (is_first)
+			{
+				// 4-bit blue -> 5 bits
+				b = (bits >> 1) & 15;
+				b = (b << 1) | (b >> 3);
+			}
+			else
+			{
+				b = bits & 31;
+			}
+
+			a = 15;
+		}
+
+		assert((r < 32) && (g < 32) && (b < 32) && (a < 16));
+
+		return color_rgba(r, g, b, a);
+	}
+
+	// Computes the four candidate colors for pixel (x, y) and writes them to pColors[0..3].
+	// pColors[0] and pColors[3] are the interpolated endpoint-0 and endpoint-1 colors of the four
+	// surrounding blocks (block coordinates wrap around the image). pColors[1] and pColors[2] are blends
+	// of those two: 5/8:3/8 and 3/8:5/8 normally, or the plain average for both (with pColors[2]'s
+	// alpha forced to 0) when the pixel's block uses transparent modulation.
+	// Returns true if the block uses transparent modulation, false otherwise.
+	bool pvrtc4_image::get_interpolated_colors(uint32_t x, uint32_t y, color_rgba* pColors) const
+	{
+		assert((x < m_width) && (y < m_height));
+
+		// Unwrapped coordinates of the top-left block of the 2x2 neighborhood.
+		const int raw_bx = (static_cast<int>(x) - 2) >> 2;
+		const int raw_by = (static_cast<int>(y) - 2) >> 2;
+
+		const int bx0 = posmod(raw_bx, m_block_width);
+		const int bx1 = posmod(raw_bx + 1, m_block_width);
+		const int by0 = posmod(raw_by, m_block_height);
+		const int by1 = posmod(raw_by + 1, m_block_height);
+
+		const pvrtc4_block& blk00 = m_blocks(bx0, by0);
+		const pvrtc4_block& blk10 = m_blocks(bx1, by0);
+		const pvrtc4_block& blk01 = m_blocks(bx0, by1);
+		const pvrtc4_block& blk11 = m_blocks(bx1, by1);
+
+		pColors[0] = interpolate(x, y, blk00.get_endpoint_5554(0), blk10.get_endpoint_5554(0), blk01.get_endpoint_5554(0), blk11.get_endpoint_5554(0));
+		pColors[3] = interpolate(x, y, blk00.get_endpoint_5554(1), blk10.get_endpoint_5554(1), blk01.get_endpoint_5554(1), blk11.get_endpoint_5554(1));
+
+		const bool transparent_mod = get_block_uses_transparent_modulation(x >> 2, y >> 2);
+
+		for (uint32_t c = 0; c < 4; c++)
+		{
+			const uint32_t lo = pColors[0][c];
+			const uint32_t hi = pColors[3][c];
+
+			if (transparent_mod)
+			{
+				const uint32_t avg = (lo + hi) / 2;
+				pColors[1][c] = static_cast<uint8_t>(avg);
+				pColors[2][c] = static_cast<uint8_t>(avg);
+			}
+			else
+			{
+				pColors[1][c] = static_cast<uint8_t>((lo * 5 + hi * 3) / 8);
+				pColors[2][c] = static_cast<uint8_t>((lo * 3 + hi * 5) / 8);
+			}
+		}
+
+		if (transparent_mod)
+			pColors[2][3] = 0;
+
+		return transparent_mod;
+	}
+		
+	// Returns the color of pixel (x, y) for modulation value m, in the same component precision as
+	// get_endpoint_5554(). m == 0 gives the interpolated endpoint-0 color and m == 3 the interpolated
+	// endpoint-1 color. For the other values the two are blended: in a transparent-modulation block the
+	// result is their average (with alpha 0 when m == 2); otherwise m == 2 weights them 3:5 and any
+	// remaining value 5:3. Block coordinates wrap around the image.
+	color_rgba pvrtc4_image::get_pixel(uint32_t x, uint32_t y, uint32_t m) const
+	{
+		assert((x < m_width) && (y < m_height));
+
+		// Unwrapped coordinates of the top-left block of the 2x2 neighborhood.
+		const int raw_bx = (static_cast<int>(x) - 2) >> 2;
+		const int raw_by = (static_cast<int>(y) - 2) >> 2;
+
+		const int bx0 = posmod(raw_bx, m_block_width);
+		const int bx1 = posmod(raw_bx + 1, m_block_width);
+		const int by0 = posmod(raw_by, m_block_height);
+		const int by1 = posmod(raw_by + 1, m_block_height);
+
+		const bool transparent_mod = get_block_uses_transparent_modulation(x >> 2, y >> 2);
+
+		const pvrtc4_block& blk00 = m_blocks(bx0, by0);
+		const pvrtc4_block& blk10 = m_blocks(bx1, by0);
+		const pvrtc4_block& blk01 = m_blocks(bx0, by1);
+		const pvrtc4_block& blk11 = m_blocks(bx1, by1);
+
+		// The pure endpoint colors do not depend on the modulation mode.
+		if (m == 0)
+			return interpolate(x, y, blk00.get_endpoint_5554(0), blk10.get_endpoint_5554(0), blk01.get_endpoint_5554(0), blk11.get_endpoint_5554(0));
+		if (m == 3)
+			return interpolate(x, y, blk00.get_endpoint_5554(1), blk10.get_endpoint_5554(1), blk01.get_endpoint_5554(1), blk11.get_endpoint_5554(1));
+
+		color_rgba lo(interpolate(x, y, blk00.get_endpoint_5554(0), blk10.get_endpoint_5554(0), blk01.get_endpoint_5554(0), blk11.get_endpoint_5554(0)));
+		color_rgba hi(interpolate(x, y, blk00.get_endpoint_5554(1), blk10.get_endpoint_5554(1), blk01.get_endpoint_5554(1), blk11.get_endpoint_5554(1)));
+
+		if (transparent_mod)
+		{
+			// Average of both colors; m == 2 additionally zeroes the alpha.
+			return color_rgba((lo[0] + hi[0]) / 2, (lo[1] + hi[1]) / 2, (lo[2] + hi[2]) / 2, (m == 2) ? 0 : (lo[3] + hi[3]) / 2);
+		}
+
+		if (m == 2)
+			return color_rgba((lo[0] * 3 + hi[0] * 5) / 8, (lo[1] * 3 + hi[1] * 5) / 8, (lo[2] * 3 + hi[2] * 5) / 8, (lo[3] * 3 + hi[3] * 5) / 8);
+
+		return color_rgba((lo[0] * 5 + hi[0] * 3) / 8, (lo[1] * 5 + hi[1] * 3) / 8, (lo[2] * 5 + hi[2] * 3) / 8, (lo[3] * 5 + hi[3] * 3) / 8);
+	}
+
+	uint64_t pvrtc4_image::local_endpoint_optimization_opaque(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual)
+	{
+		uint64_t initial_error = evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false);
+		if (!initial_error)
+			return initial_error;
+
+		vec3F c_avg_orig(0);
+
+		for (int y = 0; y < 7; y++)
+		{
+			const uint32_t py = wrap_y(by * 4 + y - 1);
+			for (uint32_t x = 0; x < 7; x++)
+			{
+				const uint32_t px = wrap_x(bx * 4 + x - 1);
+
+				const color_rgba& c = orig_img(px, py);
+
+				c_avg_orig[0] += c[0];
+				c_avg_orig[1] += c[1];
+				c_avg_orig[2] += c[2];
+			}
+		}
+
+		c_avg_orig *= 1.0f / 49.0f;
+
+		vec3F quant_colors[2];
+		quant_colors[0].set(c_avg_orig);
+		quant_colors[0] -= vec3F(.0125f);
+
+		quant_colors[1].set(c_avg_orig);
+		quant_colors[1] += vec3F(.0125f);
+
+		float total_weight[2];
+
+		bool success = true;
+
+		for (uint32_t pass = 0; pass < 4; pass++)
+		{
+			vec3F new_colors[2] = { vec3F(0), vec3F(0) };
+			memset(total_weight, 0, sizeof(total_weight));
+
+			static const float s_weights[7][7] =
+			{
+				{ 1.000000f, 1.637089f, 2.080362f, 2.242640f, 2.080362f, 1.637089f, 1.000000f },
+				{ 1.637089f, 2.414213f, 3.006572f, 3.242640f, 3.006572f, 2.414213f, 1.637089f },
+				{ 2.080362f, 3.006572f, 3.828426f, 4.242640f, 3.828426f, 3.006572f, 2.080362f },
+				{ 2.242640f, 3.242640f, 4.242640f, 5.000000f, 4.242640f, 3.242640f, 2.242640f },
+				{ 2.080362f, 3.006572f, 3.828426f, 4.242640f, 3.828426f, 3.006572f, 2.080362f },
+				{ 1.637089f, 2.414213f, 3.006572f, 3.242640f, 3.006572f, 2.414213f, 1.637089f },
+				{ 1.000000f, 1.637089f, 2.080362f, 2.242640f, 2.080362f, 1.637089f, 1.000000f }
+			};
+
+			for (int y = 0; y < 7; y++)
+			{
+				const uint32_t py = wrap_y(by * 4 + y - 1);
+				for (uint32_t x = 0; x < 7; x++)
+				{
+					const uint32_t px = wrap_x(bx * 4 + x - 1);
+
+					const color_rgba& orig_c = orig_img(px, py);
+
+					vec3F color(orig_c[0], orig_c[1], orig_c[2]);
+
+					uint32_t c = quant_colors[0].squared_distance(color) > quant_colors[1].squared_distance(color);
+
+					const float weight = s_weights[y][x];
+					new_colors[c] += color * weight;
+
+					total_weight[c] += weight;
+				}
+			}
+
+			if (!total_weight[0] || !total_weight[1])
+				success = false;
+
+			quant_colors[0] = new_colors[0] / (float)total_weight[0];
+			quant_colors[1] = new_colors[1] / (float)total_weight[1];
+		}
+
+		if (!success)
+		{
+			quant_colors[0] = c_avg_orig;
+			quant_colors[1] = c_avg_orig;
+		}
+
+		vec4F colors[2] = { quant_colors[0], quant_colors[1] };
+
+		colors[0] += vec3F(.5f);
+		colors[1] += vec3F(.5f);
+		color_rgba color_0((int)colors[0][0], (int)colors[0][1], (int)colors[0][2], 0);
+		color_rgba color_1((int)colors[1][0], (int)colors[1][1], (int)colors[1][2], 0);
+
+		pvrtc4_block cur_blocks[3][3];
+		
+		for (int y = -1; y <= 1; y++)
+		{
+			for (int x = -1; x <= 1; x++)
+			{
+				const uint32_t block_x = wrap_block_x(bx + x);
+				const uint32_t block_y = wrap_block_y(by + y);
+				cur_blocks[x + 1][y + 1] = m_blocks(block_x, block_y);
+			}
+		}
+
+		color_rgba l1(0), h1(0);
+
+		l1[0] = g_pvrtc_5_nearest[color_0[0]];
+		h1[0] = g_pvrtc_5_nearest[color_1[0]];
+
+		l1[1] = g_pvrtc_5_nearest[color_0[1]];
+		h1[1] = g_pvrtc_5_nearest[color_1[1]];
+
+		l1[2] = g_pvrtc_4_nearest[color_0[2]];
+		h1[2] = g_pvrtc_5_nearest[color_0[2]];
+
+		l1[3] = 0;
+		h1[3] = 0;
+
+		m_blocks(bx, by).set_endpoint_raw(0, l1, true);
+		m_blocks(bx, by).set_endpoint_raw(1, h1, true);
+
+		uint64_t e03_err_0 = remap_pixels_influenced_by_endpoint(bx, by, orig_img, perceptual, false);
+
+		pvrtc4_block blocks0[3][3];
+		for (int y = -1; y <= 1; y++)
+		{
+			for (int x = -1; x <= 1; x++)
+			{
+				const uint32_t block_x = wrap_block_x(bx + x);
+				const uint32_t block_y = wrap_block_y(by + y);
+				blocks0[x + 1][y + 1] = m_blocks(block_x, block_y);
+			}
+		}
+
+		l1[0] = g_pvrtc_5_nearest[color_1[0]];
+		h1[0] = g_pvrtc_5_nearest[color_0[0]];
+
+		l1[1] = g_pvrtc_5_nearest[color_1[1]];
+		h1[1] = g_pvrtc_5_nearest[color_0[1]];
+
+		l1[2] = g_pvrtc_4_nearest[color_1[2]];
+		h1[2] = g_pvrtc_5_nearest[color_0[2]];
+
+		l1[3] = 0;
+		h1[3] = 0;
+
+		m_blocks(bx, by).set_endpoint_raw(0, l1, true);
+		m_blocks(bx, by).set_endpoint_raw(1, h1, true);
+
+		uint64_t e03_err_1 = remap_pixels_influenced_by_endpoint(bx, by, orig_img, perceptual, false);
+
+		if (initial_error < basisu::minimum(e03_err_0, e03_err_1))
+		{
+			for (int y = -1; y <= 1; y++)
+			{
+				for (int x = -1; x <= 1; x++)
+				{
+					const uint32_t block_x = wrap_block_x(bx + x);
+					const uint32_t block_y = wrap_block_y(by + y);
+					m_blocks(block_x, block_y) = cur_blocks[x + 1][y + 1];
+				}
+			}
+			return initial_error;
+		}
+		else if (e03_err_0 < e03_err_1)
+		{
+			for (int y = -1; y <= 1; y++)
+			{
+				for (int x = -1; x <= 1; x++)
+				{
+					const uint32_t block_x = wrap_block_x(bx + x);
+					const uint32_t block_y = wrap_block_y(by + y);
+					m_blocks(block_x, block_y) = blocks0[x + 1][y + 1];
+				}
+			}
+			assert(e03_err_0 == evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false));
+			return e03_err_0;
+		}
+
+		assert(e03_err_1 == evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false));
+		return e03_err_1;
+	}
+
+} // basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h
@ -0,0 +1,457 @@
+// basisu_pvrtc1_4.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_gpu_texture.h"
+
+namespace basisu
+{
+	// Minimum width/height constants for the PVRTC2 and PVRTC4 formats, as named.
+	// NOTE(review): presumably these are pixel dimensions below which an image cannot be encoded — confirm at the use sites.
+	enum 
+	{ 
+		PVRTC2_MIN_WIDTH = 16, 
+		PVRTC2_MIN_HEIGHT = 8, 
+		PVRTC4_MIN_WIDTH = 8, 
+		PVRTC4_MIN_HEIGHT = 8 
+	};
+	
+	// A single 64-bit PVRTC1 4bpp block.
+	//
+	// m_modulation packs sixteen 2-bit selectors; texel (x, y) lives at bit offset (y * 4 + x) * 2.
+	// m_endpoints packs endpoint 0 in the low 16 bits and endpoint 1 in the high 16 bits.
+	// Bit 0 of m_endpoints is the block's modulation-mode flag; bits 15 and 31 mark endpoint 0 and
+	// endpoint 1 (respectively) as opaque.
+	struct pvrtc4_block
+	{
+		uint32_t m_modulation;
+		uint32_t m_endpoints;
+
+		pvrtc4_block() : m_modulation(0), m_endpoints(0) { }
+
+		// Bitwise equality of both words.
+		inline bool operator== (const pvrtc4_block& rhs) const
+		{
+			return (m_modulation == rhs.m_modulation) && (m_endpoints == rhs.m_endpoints);
+		}
+
+		// Resets both words to zero.
+		inline void clear()
+		{
+			m_modulation = 0;
+			m_endpoints = 0;
+		}
+
+		// Returns the modulation-mode flag (bit 0 of the endpoint word).
+		inline bool get_block_uses_transparent_modulation() const
+		{
+			return (m_endpoints & 1) != 0;
+		}
+
+		// Returns true if the given endpoint (0 or 1) has its opaque bit set.
+		inline bool is_endpoint_opaque(uint32_t endpoint_index) const
+		{
+			static const uint32_t s_bitmasks[2] = { 0x8000U, 0x80000000U };
+			return (m_endpoints & s_bitmasks[open_range_check(endpoint_index, 2U)]) != 0;
+		}
+
+		// Returns raw endpoint or 8888
+		color_rgba get_endpoint(uint32_t endpoint_index, bool unpack) const;
+
+		color_rgba get_endpoint_5554(uint32_t endpoint_index) const;
+
+		// Returns the number of stored bits for component c (0..3 = R, G, B, A) of the given endpoint,
+		// for either the transparent or the opaque endpoint encoding.
+		static uint32_t get_component_precision_in_bits(uint32_t c, uint32_t endpoint_index, bool opaque_endpoint)
+		{
+			static const uint32_t s_comp_prec[4][4] =
+			{
+				// R0 G0 B0 A0      R1 G1 B1 A1
+				{  4, 4, 3, 3 }, {  4, 4, 4, 3 }, // transparent endpoint
+
+				{  5, 5, 4, 0 }, {  5, 5, 5, 0 }  // opaque endpoint
+			};
+			return s_comp_prec[open_range_check(endpoint_index, 2U) + (opaque_endpoint * 2)][open_range_check(c, 4U)];
+		}
+
+		// Same table as get_component_precision_in_bits(), returned as one color_rgba per endpoint.
+		static color_rgba get_color_precision_in_bits(uint32_t endpoint_index, bool opaque_endpoint)
+		{
+			static const color_rgba s_color_prec[4] =
+			{
+			   color_rgba(4, 4, 3, 3), color_rgba(4, 4, 4, 3), // transparent endpoint
+			   color_rgba(5, 5, 4, 0), color_rgba(5, 5, 5, 0)  // opaque endpoint
+			};
+			return s_color_prec[open_range_check(endpoint_index, 2U) + (opaque_endpoint * 2)];
+		}
+
+		// Returns the 2-bit selector of texel (x, y), with x and y in 0..3.
+		inline uint32_t get_modulation(uint32_t x, uint32_t y) const
+		{
+			assert((x < 4) && (y < 4));
+			return (m_modulation >> ((y * 4 + x) * 2)) & 3;
+		}
+
+		// Stores the 2-bit selector s for texel (x, y), with x and y in 0..3.
+		inline void set_modulation(uint32_t x, uint32_t y, uint32_t s)
+		{
+			assert((x < 4) && (y < 4) && (s < 4));
+			const uint32_t n = (y * 4 + x) * 2;
+			// The mask literal is unsigned: for texel (3, 3) n is 30, and shifting a signed 3 by 30
+			// would overflow into the sign bit of int.
+			m_modulation = (m_modulation & (~(3U << n))) | (s << n);
+			assert(get_modulation(x, y) == s);
+		}
+
+		// Scaled by 8
+		// Returns the 4-entry selector -> weight table for the given modulation mode.
+		inline const uint32_t* get_scaled_modulation_values(bool block_uses_transparent_modulation) const
+		{
+			static const uint32_t s_block_scales[2][4] = { { 0, 3, 5, 8 }, { 0, 4, 4, 8 } };
+			return s_block_scales[block_uses_transparent_modulation];
+		}
+
+		// Scaled by 8
+		// Returns the weight of texel (x, y) using this block's own modulation mode.
+		inline uint32_t get_scaled_modulation(uint32_t x, uint32_t y) const
+		{
+			return get_scaled_modulation_values(get_block_uses_transparent_modulation())[get_modulation(x, y)];
+		}
+
+		// Byte-swaps both 32-bit words in place.
+		inline void byte_swap()
+		{
+			m_modulation = byteswap32(m_modulation);
+			m_endpoints = byteswap32(m_endpoints);
+		}
+
+		// Packs the already-quantized components of c into endpoint 0 or 1.
+		// opaque endpoints:	554, 555
+		// transparent endpoints: 3443, 3444
+		// Endpoint 0 gives up its lowest blue bit to the modulation-mode flag, which is preserved here.
+		inline void set_endpoint_raw(uint32_t endpoint_index, const color_rgba& c, bool opaque_endpoint)
+		{
+			assert(endpoint_index < 2);
+			const uint32_t m = m_endpoints & 1;
+			uint32_t r = c[0], g = c[1], b = c[2], a = c[3];
+
+			uint32_t packed;
+
+			if (opaque_endpoint)
+			{
+				if (!endpoint_index)
+				{
+					// 554
+					// 1RRRRRGGGGGBBBBM
+					assert((r < 32) && (g < 32) && (b < 16));
+					packed = 0x8000 | (r << 10) | (g << 5) | (b << 1) | m;
+				}
+				else
+				{
+					// 555
+					// 1RRRRRGGGGGBBBBB
+					assert((r < 32) && (g < 32) && (b < 32));
+					packed = 0x8000 | (r << 10) | (g << 5) | b;
+				}
+			}
+			else
+			{
+				if (!endpoint_index)
+				{
+					// 3443
+					// 0AAA RRRR GGGG BBBM
+					assert((r < 16) && (g < 16) && (b < 8) && (a < 8));
+					packed = (a << 12) | (r << 8) | (g << 4) | (b << 1) | m;
+				}
+				else
+				{
+					// 3444
+					// 0AAA RRRR GGGG BBBB
+					assert((r < 16) && (g < 16) && (b < 16) && (a < 8));
+					packed = (a << 12) | (r << 8) | (g << 4) | b;
+				}
+			}
+
+			assert(packed <= 0xFFFF);
+
+			// Replace only the addressed 16-bit half; the other endpoint is left untouched.
+			if (endpoint_index)
+				m_endpoints = (m_endpoints & 0xFFFFU) | (packed << 16);
+			else
+				m_endpoints = (m_endpoints & 0xFFFF0000U) | packed;
+		}
+	};
+
+	// 2D grid of PVRTC 4bpp blocks.
+	typedef vector2D<pvrtc4_block> pvrtc4_block_vector2D;
+
+	// Defined elsewhere. NOTE(review): judging by its use in pvrtc4_image::swizzle()/deswizzle(), this maps block
+	// coordinates (XPos, YPos) in an XSize x YSize block grid to a linear index in swizzled order — confirm against the definition.
+	uint32_t pvrtc4_swizzle_uv(uint32_t XSize, uint32_t YSize, uint32_t XPos, uint32_t YPos);
+
+	// A PVRTC1 4bpp image: a 2D grid of pvrtc4_block plus the pixel dimensions it covers.
+	// The neighborhood helpers (remap_pixels_influenced_by_endpoint(), evaluate_1x1_endpoint_error())
+	// wrap their coordinates around the image edges via wrap_x()/wrap_y().
+	class pvrtc4_image
+	{
+	public:
+		inline pvrtc4_image() :
+			m_width(0), m_height(0), m_block_width(0), m_block_height(0), m_uses_alpha(false)
+		{
+		}
+
+		// Constructs an image of width x height pixels (see resize()).
+		inline pvrtc4_image(uint32_t width, uint32_t height) :
+			m_width(0), m_height(0), m_block_width(0), m_block_height(0), m_uses_alpha(false)
+		{
+			resize(width, height);
+		}
+
+		// Returns the image to its empty, default-constructed state.
+		inline void clear()
+		{
+			m_width = 0;
+			m_height = 0;
+			m_block_width = 0;
+			m_block_height = 0;
+			m_blocks.clear();
+			m_uses_alpha = false;
+		}
+
+		// Sets the pixel dimensions and resizes the block grid; block counts round up to whole 4x4 blocks.
+		// Does nothing when the dimensions are unchanged.
+		inline void resize(uint32_t width, uint32_t height)
+		{
+			if ((width == m_width) && (height == m_height))
+				return;
+
+			m_width = width;
+			m_height = height;
+
+			m_block_width = (width + 3) >> 2;
+			m_block_height = (height + 3) >> 2;
+
+			m_blocks.resize(m_block_width, m_block_height);
+		}
+
+		inline uint32_t get_width() const { return m_width; }
+		inline uint32_t get_height() const { return m_height; }
+
+		inline uint32_t get_block_width() const { return m_block_width; }
+		inline uint32_t get_block_height() const { return m_block_height; }
+
+		inline const pvrtc4_block_vector2D &get_blocks() const { return m_blocks; }
+		inline pvrtc4_block_vector2D &get_blocks() { return m_blocks; }
+
+		inline uint32_t get_total_blocks() const { return m_block_width * m_block_height; }
+
+		inline bool get_uses_alpha() const { return m_uses_alpha; }
+		inline void set_uses_alpha(bool uses_alpha) { m_uses_alpha = uses_alpha; }
+
+		// Compares only the block grids (not the alpha flag).
+		inline bool are_blocks_equal(const pvrtc4_image& rhs) const
+		{
+			return m_blocks == rhs.m_blocks;
+		}
+
+		// Zero-fills the raw storage of every block.
+		inline void set_to_black()
+		{
+			memset(m_blocks.get_ptr(), 0, m_blocks.size_in_bytes());
+		}
+
+		inline bool get_block_uses_transparent_modulation(uint32_t bx, uint32_t by) const
+		{
+			return m_blocks(bx, by).get_block_uses_transparent_modulation();
+		}
+
+		inline bool is_endpoint_opaque(uint32_t bx, uint32_t by, uint32_t endpoint_index) const
+		{
+			return m_blocks(bx, by).is_endpoint_opaque(endpoint_index);
+		}
+
+		// Returns endpoint 0 or 1 of block (bx, by); see pvrtc4_block::get_endpoint() for 'unpack'.
+		color_rgba get_endpoint(uint32_t bx, uint32_t by, uint32_t endpoint_index, bool unpack) const
+		{
+			assert((bx < m_block_width) && (by < m_block_height));
+			return m_blocks(bx, by).get_endpoint(endpoint_index, unpack);
+		}
+
+		// Returns the 2-bit selector stored for pixel (x, y).
+		inline uint32_t get_modulation(uint32_t x, uint32_t y) const
+		{
+			assert((x < m_width) && (y < m_height));
+			return m_blocks(x >> 2, y >> 2).get_modulation(x & 3, y & 3);
+		}
+
+		// Returns true if the block uses transparent modulation.
+		bool get_interpolated_colors(uint32_t x, uint32_t y, color_rgba* pColors) const;
+
+		// Decodes pixel (x, y) as if its selector were m (defined elsewhere).
+		color_rgba get_pixel(uint32_t x, uint32_t y, uint32_t m) const;
+
+		// Decodes pixel (x, y) using the selector currently stored for it.
+		inline color_rgba get_pixel(uint32_t x, uint32_t y) const
+		{
+			assert((x < m_width) && (y < m_height));
+			return get_pixel(x, y, m_blocks(x >> 2, y >> 2).get_modulation(x & 3, y & 3));
+		}
+
+		// Reorders blocks so that block (x, y) is taken from index pvrtc4_swizzle_uv(...) of the current data.
+		void deswizzle()
+		{
+			pvrtc4_block_vector2D temp(m_blocks);
+
+			for (uint32_t y = 0; y < m_block_height; y++)
+				for (uint32_t x = 0; x < m_block_width; x++)
+					m_blocks(x, y) = temp[pvrtc4_swizzle_uv(m_block_width, m_block_height, x, y)];
+		}
+
+		// Inverse of deswizzle(): block (x, y) is written to index pvrtc4_swizzle_uv(...).
+		void swizzle()
+		{
+			pvrtc4_block_vector2D temp(m_blocks);
+
+			for (uint32_t y = 0; y < m_block_height; y++)
+				for (uint32_t x = 0; x < m_block_width; x++)
+					m_blocks[pvrtc4_swizzle_uv(m_block_width, m_block_height, x, y)] = temp(x, y);
+		}
+
+		// Crops img to this image's dimensions and fills it with every decoded pixel.
+		void unpack_all_pixels(image& img) const
+		{
+			img.crop(m_width, m_height);
+
+			for (uint32_t y = 0; y < m_height; y++)
+				for (uint32_t x = 0; x < m_width; x++)
+					img(x, y) = get_pixel(x, y);
+		}
+
+		// Decodes the 4x4 pixels of block (block_x, block_y) into dst(0..3, 0..3).
+		// dst is not resized here. Const because only dst is modified.
+		void unpack_block(image &dst, uint32_t block_x, uint32_t block_y) const
+		{
+			for (uint32_t y = 0; y < 4; y++)
+				for (uint32_t x = 0; x < 4; x++)
+					dst(x, y) = get_pixel(block_x * 4 + x, block_y * 4 + y);
+		}
+
+		// Wrap a (possibly negative) pixel coordinate into the image using posmod().
+		inline int wrap_x(int x) const
+		{
+			return posmod(x, m_width);
+		}
+
+		inline int wrap_y(int y) const
+		{
+			return posmod(y, m_height);
+		}
+
+		// Wrap a (possibly negative) block coordinate into the block grid using posmod().
+		inline int wrap_block_x(int bx) const
+		{
+			return posmod(bx, m_block_width);
+		}
+
+		inline int wrap_block_y(int by) const
+		{
+			return posmod(by, m_block_height);
+		}
+
+		// Returns the per-axis interpolation weights (in quarters) for pixel (x, y) within its block.
+		inline vec2F get_interpolation_factors(uint32_t x, uint32_t y) const
+		{
+			// 0 1 2 3
+			// 2 3 0 1
+			// .5 .75 0 .25
+			static const float s_interp[4] = { 2, 3, 0, 1 };
+			return vec2F(s_interp[x & 3], s_interp[y & 3]);
+		}
+
+		// Bilinearly blends four endpoint colors: p->q and r->s are blended along x, then the two
+		// results are blended along y, with weights in quarters taken from the pixel's position in its block.
+		// NOTE(review): the final scaling presumably expands 5-bit RGB / 4-bit alpha inputs to 8 bits — the
+		// assert only checks that the result fits in 0..255.
+		inline color_rgba interpolate(int x, int y,
+			const color_rgba& p, const color_rgba& q,
+			const color_rgba& r, const color_rgba& s) const
+		{
+			static const int s_interp[4] = { 2, 3, 0, 1 };
+			const int u_interp = s_interp[x & 3];
+			const int v_interp = s_interp[y & 3];
+
+			color_rgba result;
+
+			for (uint32_t c = 0; c < 4; c++)
+			{
+				int t = p[c] * 4 + u_interp * ((int)q[c] - (int)p[c]);
+				int b = r[c] * 4 + u_interp * ((int)s[c] - (int)r[c]);
+				int v = t * 4 + v_interp * (b - t);
+				if (c < 3)
+				{
+					v >>= 1;
+					v += (v >> 5);
+				}
+				else
+				{
+					v += (v >> 4);
+				}
+				assert((v >= 0) && (v < 256));
+				result[c] = static_cast<uint8_t>(v);
+			}
+
+			return result;
+		}
+
+		// Stores the 2-bit selector s for pixel (x, y).
+		inline void set_modulation(uint32_t x, uint32_t y, uint32_t s)
+		{
+			assert((x < m_width) && (y < m_height));
+			m_blocks(x >> 2, y >> 2).set_modulation(x & 3, y & 3, s);
+		}
+
+		// Picks, among the 4 interpolated colors available at (x, y), the one closest to c.
+		// Ties keep the lowest selector. When 'record' is true the winning selector is stored.
+		// Returns the winning color distance.
+		inline uint64_t map_pixel(uint32_t x, uint32_t y, const color_rgba& c, bool perceptual, bool alpha_is_significant, bool record = true)
+		{
+			color_rgba v[4];
+			get_interpolated_colors(x, y, v);
+
+			uint64_t best_dist = color_distance(perceptual, c, v[0], alpha_is_significant);
+			uint32_t best_v = 0;
+			for (uint32_t i = 1; i < 4; i++)
+			{
+				uint64_t dist = color_distance(perceptual, c, v[i], alpha_is_significant);
+				if (dist < best_dist)
+				{
+					best_dist = dist;
+					best_v = i;
+				}
+			}
+
+			if (record)
+				set_modulation(x, y, best_v);
+
+			return best_dist;
+		}
+
+		// Re-selects (and records) the best modulation for the 7x7 pixel window centered on block (bx, by),
+		// wrapping at the image edges. Returns the summed error against orig_img over that window.
+		inline uint64_t remap_pixels_influenced_by_endpoint(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual, bool alpha_is_significant)
+		{
+			uint64_t total_error = 0;
+
+			for (int yd = -3; yd <= 3; yd++)
+			{
+				const int y = wrap_y((int)by * 4 + 2 + yd);
+
+				for (int xd = -3; xd <= 3; xd++)
+				{
+					const int x = wrap_x((int)bx * 4 + 2 + xd);
+
+					total_error += map_pixel(x, y, orig_img(x, y), perceptual, alpha_is_significant);
+				}
+			}
+
+			return total_error;
+		}
+
+		// Sums the error of the currently decoded pixels against orig_img over the same 7x7 wrapped window,
+		// without modifying any selector. If threshold_error is non-zero, returns early as soon as the
+		// running total reaches it (the returned value is then a partial sum).
+		inline uint64_t evaluate_1x1_endpoint_error(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual, bool alpha_is_significant, uint64_t threshold_error = 0) const
+		{
+			uint64_t total_error = 0;
+
+			for (int yd = -3; yd <= 3; yd++)
+			{
+				const int y = wrap_y((int)by * 4 + 2 + yd);
+
+				for (int xd = -3; xd <= 3; xd++)
+				{
+					const int x = wrap_x((int)bx * 4 + 2 + xd);
+
+					total_error += color_distance(perceptual, get_pixel(x, y), orig_img(x, y), alpha_is_significant);
+
+					if ((threshold_error) && (total_error >= threshold_error))
+						return total_error;
+				}
+			}
+
+			return total_error;
+		}
+
+		uint64_t local_endpoint_optimization_opaque(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual);
+
+		// Re-selects (and records) the best modulation for every pixel. img must match this image's dimensions.
+		// Returns the total error.
+		inline uint64_t map_all_pixels(const image& img, bool perceptual, bool alpha_is_significant)
+		{
+			assert(m_width == img.get_width());
+			assert(m_height == img.get_height());
+
+			uint64_t total_error = 0;
+			for (uint32_t y = 0; y < img.get_height(); y++)
+				for (uint32_t x = 0; x < img.get_width(); x++)
+					total_error += map_pixel(x, y, img(x, y), perceptual, alpha_is_significant);
+
+			return total_error;
+		}
+
+	public:
+		// Dimensions in pixels.
+		uint32_t m_width, m_height;
+		pvrtc4_block_vector2D m_blocks;
+		// Dimensions in 4x4 blocks.
+		uint32_t m_block_width, m_block_height;
+
+		bool m_uses_alpha;
+	};
+
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
@ -0,0 +1,340 @@
+// basisu_resample_filters.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_resampler_filters.h"
+
+#ifndef M_PI
+	#define M_PI 3.14159265358979323846
+#endif
+
+namespace basisu
+{
+#define BOX_FILTER_SUPPORT (0.5f)
+	// Box (pulse / Fourier window) kernel: 1 on the half-open interval [-0.5, 0.5), 0 elsewhere.
+	static float box_filter(float t)
+	{
+		// make_clist() calls the filter function with t inverted (pos = left, neg = right),
+		// which is why the interval is closed on the negative side only.
+		const bool inside = (t >= -0.5f) && (t < 0.5f);
+		return inside ? 1.0f : 0.0f;
+	}
+
+#define TENT_FILTER_SUPPORT (1.0f)
+	// Tent kernel (box (*) box, bilinear/triangle): 1 - |t| for |t| < 1, 0 otherwise.
+	static float tent_filter(float t)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+		return (dist < 1.0f) ? (1.0f - dist) : 0.0f;
+	}
+
+#define BELL_SUPPORT (1.5f)
+	// Bell kernel (box (*) box (*) box): piecewise quadratic with support |t| < 1.5.
+	static float bell_filter(float t)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		// Central lobe.
+		if (dist < .5f)
+			return (.75f - (dist * dist));
+
+		// Tails, falling to zero at the edge of the support.
+		if (dist < 1.5f)
+		{
+			const float from_edge = (dist - 1.5f);
+			return (.5f * (from_edge * from_edge));
+		}
+
+		return (0.0f);
+	}
+
+#define B_SPLINE_SUPPORT (2.0f)
+	// Cubic B-spline kernel (box (*) box (*) box (*) box): piecewise cubic with support |t| < 2.
+	static float B_spline_filter(float t)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		if (dist < 1.0f)
+		{
+			const float sq = dist * dist;
+			return ((.5f * sq * dist) - sq + (2.0f / 3.0f));
+		}
+
+		if (dist < 2.0f)
+		{
+			const float from_edge = 2.0f - dist;
+			return ((1.0f / 6.0f) * (from_edge * from_edge * from_edge));
+		}
+
+		return (0.0f);
+	}
+
+	// Dodgson, N., "Quadratic Interpolation for Image Resampling"
+	// Piecewise-quadratic kernel with support |t| < 1.5; R selects the member of the family
+	// (see the quadratic_*_filter wrappers for the values used).
+#define QUADRATIC_SUPPORT 1.5f
+	static float quadratic(float t, const float R)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		// Outside the support (this also catches NaN).
+		if (!(dist < QUADRATIC_SUPPORT))
+			return 0.0f;
+
+		const float sq = dist * dist;
+		if (dist <= .5f)
+			return (-2.0f * R) * sq + .5f * (R + 1.0f);
+
+		return (R * sq) + (-2.0f * R - .5f) * dist + (3.0f / 4.0f) * (R + 1.0f);
+	}
+
+	// quadratic() kernel with R = 1.
+	static float quadratic_interp_filter(float t)
+	{
+		const float R = 1.0f;
+		return quadratic(t, R);
+	}
+
+	// quadratic() kernel with R = 0.5.
+	static float quadratic_approx_filter(float t)
+	{
+		const float R = .5f;
+		return quadratic(t, R);
+	}
+
+	// quadratic() kernel with R = 0.8.
+	static float quadratic_mix_filter(float t)
+	{
+		const float R = .8f;
+		return quadratic(t, R);
+	}
+
+	// Mitchell, D. and A. Netravali, "Reconstruction Filters in Computer Graphics."
+	// Computer Graphics, Vol. 22, No. 4, pp. 221-228.
+	// (B, C)
+	// (1/3, 1/3)  - Defaults recommended by Mitchell and Netravali
+	// (1, 0)	   - Equivalent to the Cubic B-Spline
+	// (0, 0.5)		- Equivalent to the Catmull-Rom Spline
+	// (0, C)		- The family of Cardinal Cubic Splines
+	// (B, 0)		- Duff's tensioned B-Splines.
+	//
+	// Evaluates the two-parameter cubic at t; the kernel is zero for |t| >= 2.
+	static float mitchell(float t, const float B, const float C)
+	{
+		// The square is sign-independent, so it is taken before folding t onto the positive axis.
+		const float sq = t * t;
+		const float dist = (t < 0.0f) ? -t : t;
+
+		if (dist < 1.0f)
+		{
+			const float inner = (((12.0f - 9.0f * B - 6.0f * C) * (dist * sq)) + ((-18.0f + 12.0f * B + 6.0f * C) * sq) + (6.0f - 2.0f * B));
+
+			return (inner / 6.0f);
+		}
+
+		if (dist < 2.0f)
+		{
+			const float outer = (((-1.0f * B - 6.0f * C) * (dist * sq)) + ((6.0f * B + 30.0f * C) * sq) + ((-12.0f * B - 48.0f * C) * dist) + (8.0f * B + 24.0f * C));
+
+			return (outer / 6.0f);
+		}
+
+		return (0.0f);
+	}
+
+#define MITCHELL_SUPPORT (2.0f)
+	// mitchell() with B = C = 1/3.
+	static float mitchell_filter(float t)
+	{
+		const float B = 1.0f / 3.0f;
+		const float C = 1.0f / 3.0f;
+		return mitchell(t, B, C);
+	}
+
+#define CATMULL_ROM_SUPPORT (2.0f)
+	// mitchell() with B = 0, C = 0.5.
+	static float catmull_rom_filter(float t)
+	{
+		const float B = 0.0f;
+		const float C = .5f;
+		return mitchell(t, B, C);
+	}
+
+	// Normalized sinc: sin(pi * x) / (pi * x). Near zero a short series expansion is used instead of the division.
+	static double sinc(double x)
+	{
+		const double px = (x * M_PI);
+
+		const bool near_zero = (px < 0.01f) && (px > -0.01f);
+		if (!near_zero)
+			return sin(px) / px;
+
+		const double sq = px * px;
+		return 1.0f + sq * (-1.0f / 6.0f + sq * 1.0f / 120.0f);
+	}
+
+	// Converts t to float, snapping magnitudes below a small epsilon to exactly zero.
+	static float clean(double t)
+	{
+		const float EPSILON = .0000125f;
+		return (fabs(t) < EPSILON) ? 0.0f : (float)t;
+	}
+
+	//static double blackman_window(double x)
+	//{
+	//	return .42f + .50f * cos(M_PI*x) + .08f * cos(2.0f*M_PI*x);
+	//}
+
+	// Three-term cosine window evaluated at x, using the "exact Blackman" coefficients below.
+	static double blackman_exact_window(double x)
+	{
+		const float a0 = 0.42659071f;
+		const float a1 = 0.49656062f;
+		const float a2 = 0.07684867f;
+		return a0 + a1 * cos(M_PI * x) + a2 * cos(2.0f * M_PI * x);
+	}
+
+#define BLACKMAN_SUPPORT (3.0f)
+	// Sinc windowed by blackman_exact_window() over |t| < 3; zero outside.
+	static float blackman_filter(float t)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		if (!(dist < 3.0f))
+			return (0.0f);
+
+		// Alternative (non-exact) window, kept for reference:
+		//return clean(sinc(t) * blackman_window(t / 3.0f));
+		return clean(sinc(dist) * blackman_exact_window(dist / 3.0f));
+	}
+
+#define GAUSSIAN_SUPPORT (1.25f)
+	// Gaussian kernel multiplied by a blackman window over |t| < GAUSSIAN_SUPPORT; zero outside.
+	static float gaussian_filter(float t)
+	{
+		const float dist = (t < 0) ? -t : t;
+
+		if (!(dist < GAUSSIAN_SUPPORT))
+			return 0.0f;
+
+		return clean(exp(-2.0f * dist * dist) * sqrt(2.0f / M_PI) * blackman_exact_window(dist / GAUSSIAN_SUPPORT));
+	}
+
+	// Windowed sinc -- see "Jim Blinn's Corner: Dirty Pixels" pg. 26.
+	// Lanczos kernel with 3 lobes: sinc(t) * sinc(t / 3) for |t| < 3, zero outside.
+#define LANCZOS3_SUPPORT (3.0f)
+	static float lanczos3_filter(float t)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		if (!(dist < 3.0f))
+			return (0.0f);
+
+		return clean(sinc(dist) * sinc(dist / 3.0f));
+	}
+
+#define LANCZOS4_SUPPORT (4.0f)
+	// Lanczos kernel with 4 lobes: sinc(t) * sinc(t / 4) for |t| < 4, zero outside.
+	static float lanczos4_filter(float t)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		if (!(dist < 4.0f))
+			return (0.0f);
+
+		return clean(sinc(dist) * sinc(dist / 4.0f));
+	}
+
+#define LANCZOS6_SUPPORT (6.0f)
+	// Lanczos kernel with 6 lobes: sinc(t) * sinc(t / 6) for |t| < 6, zero outside.
+	static float lanczos6_filter(float t)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		if (!(dist < 6.0f))
+			return (0.0f);
+
+		return clean(sinc(dist) * sinc(dist / 6.0f));
+	}
+
+#define LANCZOS12_SUPPORT (12.0f)
+	// Lanczos kernel with 12 lobes: sinc(t) * sinc(t / 12) for |t| < 12, zero outside.
+	static float lanczos12_filter(float t)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		if (!(dist < 12.0f))
+			return (0.0f);
+
+		return clean(sinc(dist) * sinc(dist / 12.0f));
+	}
+
+	// Sums the series 1 + sum_{k>=1} ((x/2)^k / k!)^2, stopping once the newest term is no larger
+	// than EPSILON_RATIO times the running sum.
+	static double bessel0(double x)
+	{
+		const double EPSILON_RATIO = 1E-16;
+		const double half_x = 0.5 * x;
+
+		double sum = 1.0;
+		double term = 1.0;    // (x/2)^k / k!, built up incrementally
+		double term_sq = 1.0; // square of 'term': the value actually added to the sum
+
+		// FIXME: Shouldn't this stop after X iterations for max. safety?
+		for (int k = 1; term_sq > sum * EPSILON_RATIO; ++k)
+		{
+			term = term * (half_x / k);
+			term_sq = term * term;
+			sum = sum + term_sq;
+		}
+
+		return sum;
+	}
+
+	//static const float KAISER_ALPHA = 4.0;
+	// Kaiser window value at x for a window of the given half width:
+	// bessel0(alpha * sqrt(1 - (x / half_width)^2)) / bessel0(alpha).
+	static double kaiser(double alpha, double half_width, double x)
+	{
+		const double ratio = (x / half_width);
+		const double numerator = bessel0(alpha * sqrt(1 - ratio * ratio));
+		const double denominator = bessel0(alpha);
+		return numerator / denominator;
+	}
+
+#define KAISER_SUPPORT 3
+	// Sinc windowed by kaiser() over |t| < KAISER_SUPPORT; zero outside.
+	// The window's alpha is derived from a fixed attenuation of 40 (dB).
+	static float kaiser_filter(float t)
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		if (!(dist < KAISER_SUPPORT))
+			return 0.0f;
+
+		// db atten
+		const float att = 40.0f;
+		const float alpha = (float)(exp(log((double)0.58417 * (att - 20.96)) * 0.4) + 0.07886 * (att - 20.96));
+		//const float alpha = KAISER_ALPHA;
+		return (float)clean(sinc(dist) * kaiser(alpha, KAISER_SUPPORT, dist));
+	}
+
+	// Table of all available resampling kernels. Each entry is { name, kernel function, support },
+	// where the support is the matching *_SUPPORT macro defined next to the kernel.
+	// find_resample_filter() returns indices into this table, so reordering entries changes those indices.
+	const resample_filter g_resample_filters[] =
+	{
+		{ "box", box_filter, BOX_FILTER_SUPPORT }, 
+		{ "tent", tent_filter, TENT_FILTER_SUPPORT }, 
+		{ "bell", bell_filter, BELL_SUPPORT }, 
+		{ "b-spline", B_spline_filter, B_SPLINE_SUPPORT },
+		{ "mitchell", mitchell_filter, MITCHELL_SUPPORT }, 
+		{ "blackman", blackman_filter, BLACKMAN_SUPPORT }, 
+		{ "lanczos3", lanczos3_filter, LANCZOS3_SUPPORT },
+		{ "lanczos4", lanczos4_filter, LANCZOS4_SUPPORT },
+		{ "lanczos6", lanczos6_filter, LANCZOS6_SUPPORT }, 
+		{ "lanczos12", lanczos12_filter, LANCZOS12_SUPPORT }, 
+		{ "kaiser", kaiser_filter, KAISER_SUPPORT }, 
+		{ "gaussian", gaussian_filter, GAUSSIAN_SUPPORT },
+		{ "catmullrom", catmull_rom_filter, CATMULL_ROM_SUPPORT }, 
+		{ "quadratic_interp", quadratic_interp_filter, QUADRATIC_SUPPORT }, 
+		{ "quadratic_approx", quadratic_approx_filter, QUADRATIC_SUPPORT }, 
+		{ "quadratic_mix", quadratic_mix_filter, QUADRATIC_SUPPORT },
+	};
+
+	// Number of entries in g_resample_filters.
+	const int g_num_resample_filters = BASISU_ARRAY_SIZE(g_resample_filters);
+
+	// Returns the index in g_resample_filters of the entry whose name exactly (case-sensitively)
+	// matches pName, or -1 if there is none. pName must not be null.
+	int find_resample_filter(const char *pName)
+	{
+		int index = 0;
+		while (index < g_num_resample_filters)
+		{
+			const bool matches = (strcmp(pName, g_resample_filters[index].name) == 0);
+			if (matches)
+				return index;
+			++index;
+		}
+
+		return -1;
+	}
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_resampler.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_resampler.cpp
@ -0,0 +1,844 @@
+// basisu_resampler.cpp
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_resampler.h"
+#include "basisu_resampler_filters.h"
+
+#define RESAMPLER_DEBUG 0
+
+namespace basisu
+{
+	// Debug bounds check: asserts 0 <= v < h and returns v unchanged, so it can wrap an index expression inline.
+	// When assert() compiles away this is a plain pass-through; BASISU_NOTE_UNUSED(h) presumably silences the
+	// resulting unused-parameter warning.
+	static inline int resampler_range_check(int v, int h)
+	{
+		BASISU_NOTE_UNUSED(h);
+		assert((v >= 0) && (v < h));
+		return v;
+	}
+
+	// Converts a Resample_Real to int, discarding the fractional part (truncation toward zero).
+	static inline int cast_to_int(Resample_Real i)
+	{
+		const int truncated = static_cast<int>(i);
+		return truncated;
+	}
+
+	// Maps source sample index j into [0, src_x) according to boundary_op.
+	// In-range indices are returned unchanged. Out-of-range indices are mirrored about the nearest edge
+	// (BOUNDARY_REFLECT, clamped if the mirror image is itself out of range), wrapped with posmod()
+	// (BOUNDARY_WRAP), or clamped to the nearest edge (any other op).
+	int Resampler::reflect(const int j, const int src_x, const Boundary_Op boundary_op)
+	{
+		const bool below = (j < 0);
+
+		// Already a valid source index.
+		if (!below && (j < src_x))
+			return j;
+
+		if (boundary_op == BOUNDARY_REFLECT)
+		{
+			if (below)
+			{
+				const int mirrored = -j;
+				return (mirrored >= src_x) ? (src_x - 1) : mirrored;
+			}
+
+			const int mirrored = (src_x - j) + (src_x - 1);
+			return (mirrored < 0) ? 0 : mirrored;
+		}
+
+		if (boundary_op == BOUNDARY_WRAP)
+			return posmod(j, src_x);
+
+		// Clamp to the nearest edge.
+		return below ? 0 : (src_x - 1);
+	}
+
+	// The make_clist() method generates, for all destination samples,
+	// the list of all source samples with non-zero weighted contributions.
+	//
+	// src_x / dst_x   - number of source / destination samples along the axis.
+	// boundary_op     - how out-of-range source indices are remapped (see reflect()).
+	// Pfilter         - kernel function; filter_support is its half width, filter_scale stretches it.
+	// src_ofs         - offset added to every kernel center, in source samples.
+	//
+	// Returns a calloc()'d array of dst_x Contrib_List entries, or NULL on allocation failure or when
+	// some destination sample ends up with no contributors. All lists point into one shared calloc()'d
+	// Contrib pool whose start is entry 0's 'p' pointer; the caller owns both allocations.
+	//
+	// Minification (dst_x < src_x) and magnification share a single code path: the only difference is
+	// 'kernel_scale', which stretches the kernel by 1/xscale when minifying and is exactly 1 otherwise.
+	Resampler::Contrib_List * Resampler::make_clist(
+		int src_x, int dst_x, Boundary_Op boundary_op,
+		Resample_Real(*Pfilter)(Resample_Real),
+		Resample_Real filter_support,
+		Resample_Real filter_scale,
+		Resample_Real src_ofs)
+	{
+		struct Contrib_Bounds
+		{
+			// The center of the range in DISCRETE coordinates (pixel center = 0.0f).
+			Resample_Real center;
+			int left, right;
+		};
+
+		Contrib_List* Pcontrib = (Contrib_List*)calloc(dst_x, sizeof(Contrib_List));
+		if (Pcontrib == NULL)
+			return NULL;
+
+		Contrib_Bounds* Pcontrib_bounds = (Contrib_Bounds*)calloc(dst_x, sizeof(Contrib_Bounds));
+		if (!Pcontrib_bounds)
+		{
+			free(Pcontrib);
+			return (NULL);
+		}
+
+		const Resample_Real oo_filter_scale = 1.0f / filter_scale;
+
+		const Resample_Real NUDGE = 0.5f;
+		const Resample_Real xscale = dst_x / (Resample_Real)src_x;
+
+		// Fewer destination than source samples (downsampling/minification): the kernel is stretched so it
+		// spans 1/xscale source samples per unit. Otherwise (upsampling) it is evaluated unscaled.
+		const Resample_Real kernel_scale = (xscale < 1.0f) ? xscale : 1.0f;
+
+		// (Stretched) half width of the filter, in source samples.
+		const Resample_Real half_width = (filter_support / kernel_scale) * filter_scale;
+
+		// Find the range of source sample(s) that will contribute to each destination sample.
+
+		int n = 0;
+		for (int i = 0; i < dst_x; i++)
+		{
+			// Convert from discrete to continuous coordinates, scale, then convert back to discrete.
+			Resample_Real center = ((Resample_Real)i + NUDGE) / xscale;
+			center -= NUDGE;
+			center += src_ofs;
+
+			const int left = cast_to_int((Resample_Real)floor(center - half_width));
+			const int right = cast_to_int((Resample_Real)ceil(center + half_width));
+
+			Pcontrib_bounds[i].center = center;
+			Pcontrib_bounds[i].left = left;
+			Pcontrib_bounds[i].right = right;
+
+			n += (right - left + 1);
+		}
+
+		// Allocate memory for contributors (one pool shared by every destination sample).
+
+		const int total = n;
+		(void)total; // only read by the assert below
+
+		Contrib* Pcpool = NULL;
+		if ((total == 0) || ((Pcpool = (Contrib*)calloc(total, sizeof(Contrib))) == NULL))
+		{
+			free(Pcontrib);
+			free(Pcontrib_bounds);
+			return NULL;
+		}
+
+		Contrib* Pcpool_next = Pcpool;
+
+		// Create the list of source samples which contribute to each destination sample.
+
+		for (int i = 0; i < dst_x; i++)
+		{
+			int max_k = -1;
+			Resample_Real max_w = -1e+20f;
+
+			const Resample_Real center = Pcontrib_bounds[i].center;
+			const int left = Pcontrib_bounds[i].left;
+			const int right = Pcontrib_bounds[i].right;
+
+			Pcontrib[i].n = 0;
+			Pcontrib[i].p = Pcpool_next;
+			Pcpool_next += (right - left + 1);
+			assert((Pcpool_next - Pcpool) <= total);
+
+			// First pass: sum the raw kernel values so the weights can be normalized.
+			Resample_Real total_weight = 0;
+
+			for (int j = left; j <= right; j++)
+				total_weight += (*Pfilter)((center - (Resample_Real)j) * kernel_scale * oo_filter_scale);
+			const Resample_Real norm = static_cast<Resample_Real>(1.0f / total_weight);
+
+			total_weight = 0;
+
+#if RESAMPLER_DEBUG
+			printf("%i: ", i);
+#endif
+
+			// Second pass: record every non-zero normalized weight.
+			for (int j = left; j <= right; j++)
+			{
+				const Resample_Real weight = (*Pfilter)((center - (Resample_Real)j) * kernel_scale * oo_filter_scale) * norm;
+				if (weight == 0.0f)
+					continue;
+
+				const int src_index = reflect(j, src_x, boundary_op);
+
+#if RESAMPLER_DEBUG
+				printf("%i(%f), ", src_index, weight);
+#endif
+
+				// Increment the number of source samples which contribute to the current destination sample.
+
+				const int k = Pcontrib[i].n++;
+
+				Pcontrib[i].p[k].pixel = (unsigned short)src_index; /* store src sample number */
+				Pcontrib[i].p[k].weight = weight;                   /* store src sample weight */
+
+				total_weight += weight; /* total weight of all contributors */
+
+				if (weight > max_w)
+				{
+					max_w = weight;
+					max_k = k;
+				}
+			}
+
+#if RESAMPLER_DEBUG
+			printf("\n\n");
+#endif
+
+			//assert(Pcontrib[i].n);
+			//assert(max_k != -1);
+			if ((max_k == -1) || (Pcontrib[i].n == 0))
+			{
+				free(Pcpool);
+				free(Pcontrib);
+				free(Pcontrib_bounds);
+				return NULL;
+			}
+
+			// Fold any rounding residue into the largest weight so the weights sum to 1.
+			if (total_weight != 1.0f)
+				Pcontrib[i].p[max_k].weight += 1.0f - total_weight;
+		}
+
+#if RESAMPLER_DEBUG
+		printf("*******\n");
+#endif
+
+		free(Pcontrib_bounds);
+
+		return Pcontrib;
+	}
+
+	// Horizontally resamples one line: each of the m_resample_dst_x outputs is the weighted sum of the
+	// source samples listed in the corresponding m_Pclist_x entry.
+	void Resampler::resample_x(Sample * Pdst, const Sample * Psrc)
+	{
+		assert(Pdst);
+		assert(Psrc);
+
+		for (int dst_index = 0; dst_index < m_resample_dst_x; dst_index++)
+		{
+			const Contrib_List& clist = m_Pclist_x[dst_index];
+
+#if BASISU_RESAMPLER_DEBUG_OPS
+			total_ops += clist.n;
+#endif
+
+			Sample total = 0;
+			for (int c = 0; c < clist.n; c++)
+			{
+				const Contrib& contrib = clist.p[c];
+				total += Psrc[contrib.pixel] * contrib.weight;
+			}
+
+			Pdst[dst_index] = total;
+		}
+	}
+
+	// Writes Psrc[i] * weight into Ptmp[i] for the first dst_x samples, overwriting the destination.
+	void Resampler::scale_y_mov(Sample * Ptmp, const Sample * Psrc, Resample_Real weight, int dst_x)
+	{
+#if BASISU_RESAMPLER_DEBUG_OPS
+		total_ops += dst_x;
+#endif
+
+		// Not += because temp buf wasn't cleared.
+		for (int i = 0; i < dst_x; i++)
+			Ptmp[i] = Psrc[i] * weight;
+	}
+
+	// Accumulates Psrc[i] * weight onto Ptmp[i] for the first dst_x samples.
+	void Resampler::scale_y_add(Sample * Ptmp, const Sample * Psrc, Resample_Real weight, int dst_x)
+	{
+#if BASISU_RESAMPLER_DEBUG_OPS
+		total_ops += dst_x;
+#endif
+
+		for (int i = 0; i < dst_x; i++)
+			Ptmp[i] += Psrc[i] * weight;
+	}
+
+	// Applies clamp_sample() in place to the first n samples of Pdst.
+	void Resampler::clamp(Sample * Pdst, int n)
+	{
+		for (int i = 0; i < n; i++)
+		{
+			const Sample value = Pdst[i];
+			Pdst[i] = clamp_sample(value);
+		}
+	}
+
+	// Produces destination line m_cur_dst_y into Pdst by blending the buffered source lines listed in
+	// its Y contributor list. If X resampling was delayed, the blend goes to m_Ptmp_buf first and is
+	// then resampled horizontally into Pdst. Source lines that are no longer needed release their
+	// scan buffer slot.
+	void Resampler::resample_y(Sample * Pdst)
+	{
+		Contrib_List* Pclist = &m_Pclist_y[m_cur_dst_y];
+
+		Sample* Ptmp = m_delay_x_resample ? m_Ptmp_buf : Pdst;
+		assert(Ptmp);
+
+		/* Process each contributor. */
+
+		for (int i = 0; i < Pclist->n; i++)
+		{
+			const Contrib& contrib = Pclist->p[i];
+
+			// locate the contributor's location in the scan buffer -- the contributor must always be found!
+			int j = 0;
+			while ((j < MAX_SCAN_BUF_SIZE) && !(m_Pscan_buf->scan_buf_y[j] == contrib.pixel))
+				j++;
+
+			assert(j < MAX_SCAN_BUF_SIZE);
+
+			Sample* Psrc = m_Pscan_buf->scan_buf_l[j];
+
+			// The first contributor overwrites the (uncleared) accumulator, the rest add to it.
+			if (i == 0)
+				scale_y_mov(Ptmp, Psrc, contrib.weight, m_intermediate_x);
+			else
+				scale_y_add(Ptmp, Psrc, contrib.weight, m_intermediate_x);
+
+			/* If this source line doesn't contribute to any
+			* more destination lines then mark the scanline buffer slot
+			* which holds this source line as free.
+			* (The max. number of slots used depends on the Y
+			* axis sampling factor and the scaled filter width.)
+			*/
+
+			const int src_line = resampler_range_check(contrib.pixel, m_resample_src_y);
+			if (--m_Psrc_y_count[src_line] == 0)
+			{
+				m_Psrc_y_flag[src_line] = false;
+				m_Pscan_buf->scan_buf_y[j] = -1;
+			}
+		}
+
+		/* Now generate the destination line */
+
+		if (!m_delay_x_resample)
+		{
+			assert(Pdst == Ptmp);
+		}
+		else // X resampling was delayed until after Y resampling.
+		{
+			assert(Pdst != Ptmp);
+			resample_x(Pdst, Ptmp);
+		}
+
+		if (m_lo < m_hi)
+			clamp(Pdst, m_resample_dst_x);
+	}
+
+	// Feeds the next source scanline (m_resample_src_x samples) to the resampler.
+	// Returns true when the line was consumed (or was not needed by any destination line).
+	// Returns false when:
+	//  - every source line has already been supplied (status is left unchanged),
+	//  - no scan buffer slot is free (status becomes STATUS_SCAN_BUFFER_FULL),
+	//  - a slot's line buffer could not be allocated (status becomes STATUS_OUT_OF_MEMORY).
+	bool Resampler::put_line(const Sample * Psrc)
+	{
+		int i;
+
+		if (m_cur_src_y >= m_resample_src_y)
+			return false;
+
+		/* Does this source line contribute
+		* to any destination line? if not,
+		* exit now.
+		*/
+
+		if (!m_Psrc_y_count[resampler_range_check(m_cur_src_y, m_resample_src_y)])
+		{
+			m_cur_src_y++;
+			return true;
+		}
+
+		/* Find an empty slot in the scanline buffer. (FIXME: Perf. is terrible here with extreme scaling ratios.) */
+
+		for (i = 0; i < MAX_SCAN_BUF_SIZE; i++)
+			if (m_Pscan_buf->scan_buf_y[i] == -1)
+				break;
+
+		/* If the buffer is full, exit with an error. */
+
+		if (i == MAX_SCAN_BUF_SIZE)
+		{
+			m_status = STATUS_SCAN_BUFFER_FULL;
+			return false;
+		}
+
+		/* Does this slot have any memory allocated to it? */
+
+		if (!m_Pscan_buf->scan_buf_l[i])
+		{
+			if ((m_Pscan_buf->scan_buf_l[i] = (Sample*)malloc(m_intermediate_x * sizeof(Sample))) == NULL)
+			{
+				m_status = STATUS_OUT_OF_MEMORY;
+				return false;
+			}
+		}
+
+		// Only claim the slot and flag the line as present once the slot has storage.
+		// Otherwise a failed allocation would leave a "present" line whose buffer is NULL,
+		// which resample_y() would dereference.
+		m_Psrc_y_flag[resampler_range_check(m_cur_src_y, m_resample_src_y)] = true;
+		m_Pscan_buf->scan_buf_y[i] = m_cur_src_y;
+
+		// Resampling on the X axis first?
+		if (m_delay_x_resample)
+		{
+			assert(m_intermediate_x == m_resample_src_x);
+
+			// Y-X resampling order: buffer the raw source line, X is resampled later.
+			memcpy(m_Pscan_buf->scan_buf_l[i], Psrc, m_intermediate_x * sizeof(Sample));
+		}
+		else
+		{
+			assert(m_intermediate_x == m_resample_dst_x);
+
+			// X-Y resampling order: buffer the line already resampled along X.
+			resample_x(m_Pscan_buf->scan_buf_l[i], Psrc);
+		}
+
+		m_cur_src_y++;
+
+		return true;
+	}
+
+	// Returns the next finished destination scanline, or NULL when either all destination
+	// lines have been produced or some source line it depends on has not been supplied yet.
+	// The returned pointer is the resampler's own m_Pdst_buf.
+	const Resampler::Sample* Resampler::get_line()
+	{
+		// Nothing left to generate.
+		if (m_cur_dst_y == m_resample_dst_y)
+			return NULL;
+
+		// Every contributing source line must currently be buffered.
+		const Contrib_List& clist = m_Pclist_y[m_cur_dst_y];
+		for (int k = 0; k < clist.n; k++)
+		{
+			const int src_line = resampler_range_check(clist.p[k].pixel, m_resample_src_y);
+			if (!m_Psrc_y_flag[src_line])
+				return NULL;
+		}
+
+		resample_y(m_Pdst_buf);
+		m_cur_dst_y++;
+
+		return m_Pdst_buf;
+	}
+
+	// Releases every buffer owned by the resampler. Contributor lists supplied by the
+	// caller (m_clist_x_forced / m_clist_y_forced) are not freed.
+	Resampler::~Resampler()
+	{
+#if BASISU_RESAMPLER_DEBUG_OPS
+		printf("actual ops: %i\n", total_ops);
+#endif
+
+		free(m_Pdst_buf);
+		m_Pdst_buf = NULL;
+
+		// free(NULL) is a no-op, so no explicit null test is needed here.
+		free(m_Ptmp_buf);
+		m_Ptmp_buf = NULL;
+
+		// Only the first entry's p is freed; presumably it is the base of a single
+		// contributor pool allocated by make_clist() -- verify against make_clist().
+		if (m_Pclist_x && !m_clist_x_forced)
+		{
+			free(m_Pclist_x->p);
+			free(m_Pclist_x);
+			m_Pclist_x = NULL;
+		}
+
+		if (m_Pclist_y && !m_clist_y_forced)
+		{
+			free(m_Pclist_y->p);
+			free(m_Pclist_y);
+			m_Pclist_y = NULL;
+		}
+
+		free(m_Psrc_y_count);
+		m_Psrc_y_count = NULL;
+
+		free(m_Psrc_y_flag);
+		m_Psrc_y_flag = NULL;
+
+		if (m_Pscan_buf)
+		{
+			for (int slot = 0; slot < MAX_SCAN_BUF_SIZE; slot++)
+				free(m_Pscan_buf->scan_buf_l[slot]);
+
+			free(m_Pscan_buf);
+			m_Pscan_buf = NULL;
+		}
+	}
+
+	// Rewinds the resampler so another frame of the same dimensions can be processed.
+	// Does nothing if the resampler is in an error state.
+	void Resampler::restart()
+	{
+		if (m_status != STATUS_OKAY)
+			return;
+
+		m_cur_src_y = 0;
+		m_cur_dst_y = 0;
+
+		// Forget which source lines are buffered and how often each is still needed.
+		for (int src_line = 0; src_line < m_resample_src_y; src_line++)
+		{
+			m_Psrc_y_count[src_line] = 0;
+			m_Psrc_y_flag[src_line] = false;
+		}
+
+		// Recount how many destination lines each source line contributes to.
+		for (int dst_line = 0; dst_line < m_resample_dst_y; dst_line++)
+		{
+			const Contrib_List& clist = m_Pclist_y[dst_line];
+			for (int k = 0; k < clist.n; k++)
+				m_Psrc_y_count[resampler_range_check(clist.p[k].pixel, m_resample_src_y)]++;
+		}
+
+		// Empty every scan buffer slot and drop its line storage.
+		for (int slot = 0; slot < MAX_SCAN_BUF_SIZE; slot++)
+		{
+			free(m_Pscan_buf->scan_buf_l[slot]);
+			m_Pscan_buf->scan_buf_l[slot] = NULL;
+			m_Pscan_buf->scan_buf_y[slot] = -1;
+		}
+	}
+
+	// Builds a resampler that converts a src_x by src_y image into a dst_x by dst_y image.
+	//
+	// boundary_op            - passed to make_clist() to handle samples near the image edges.
+	// sample_low/sample_high - output clamp range; clamping is skipped unless sample_low < sample_high.
+	// Pfilter_name           - filter table entry to use; NULL selects BASISU_RESAMPLER_DEFAULT_FILTER.
+	// Pclist_x/Pclist_y      - optional caller-owned contributor lists; when given they are used
+	//                          as-is and are not freed by the destructor.
+	// filter_*_scale, src_*_ofs - forwarded to make_clist() for the respective axis.
+	//
+	// Construction never throws: on failure m_status is set (STATUS_OUT_OF_MEMORY or
+	// STATUS_BAD_FILTER_NAME) and the constructor returns early. Callers must check status().
+	Resampler::Resampler(int src_x, int src_y,
+		int dst_x, int dst_y,
+		Boundary_Op boundary_op,
+		Resample_Real sample_low, Resample_Real sample_high,
+		const char* Pfilter_name,
+		Contrib_List * Pclist_x,
+		Contrib_List * Pclist_y,
+		Resample_Real filter_x_scale,
+		Resample_Real filter_y_scale,
+		Resample_Real src_x_ofs,
+		Resample_Real src_y_ofs)
+	{
+		int i, j;
+		Resample_Real support, (*func)(Resample_Real);
+
+		assert(src_x > 0);
+		assert(src_y > 0);
+		assert(dst_x > 0);
+		assert(dst_y > 0);
+
+#if BASISU_RESAMPLER_DEBUG_OPS
+		total_ops = 0;
+#endif
+
+		m_lo = sample_low;
+		m_hi = sample_high;
+
+		// Put every member into a defined state before the first possible early return,
+		// so the destructor (and accidental use after a failed construction) never sees
+		// indeterminate values.
+		m_delay_x_resample = false;
+		m_intermediate_x = 0;
+		m_Pdst_buf = NULL;
+		m_Ptmp_buf = NULL;
+		m_clist_x_forced = false;
+		m_Pclist_x = NULL;
+		m_clist_y_forced = false;
+		m_Pclist_y = NULL;
+		m_Psrc_y_count = NULL;
+		m_Psrc_y_flag = NULL;
+		m_Pscan_buf = NULL;
+		m_status = STATUS_OKAY;
+		m_cur_src_y = m_cur_dst_y = 0;
+
+		m_resample_src_x = src_x;
+		m_resample_src_y = src_y;
+		m_resample_dst_x = dst_x;
+		m_resample_dst_y = dst_y;
+
+		m_boundary_op = boundary_op;
+
+		if ((m_Pdst_buf = (Sample*)malloc(m_resample_dst_x * sizeof(Sample))) == NULL)
+		{
+			m_status = STATUS_OUT_OF_MEMORY;
+			return;
+		}
+
+		// Find the specified filter.
+
+		if (Pfilter_name == NULL)
+			Pfilter_name = BASISU_RESAMPLER_DEFAULT_FILTER;
+
+		for (i = 0; i < g_num_resample_filters; i++)
+			if (strcmp(Pfilter_name, g_resample_filters[i].name) == 0)
+				break;
+
+		if (i == g_num_resample_filters)
+		{
+			m_status = STATUS_BAD_FILTER_NAME;
+			return;
+		}
+
+		func = g_resample_filters[i].func;
+		support = g_resample_filters[i].support;
+
+		/* Create contributor lists, unless the user supplied custom lists. */
+
+		if (!Pclist_x)
+		{
+			m_Pclist_x = make_clist(m_resample_src_x, m_resample_dst_x, m_boundary_op, func, support, filter_x_scale, src_x_ofs);
+			if (!m_Pclist_x)
+			{
+				m_status = STATUS_OUT_OF_MEMORY;
+				return;
+			}
+		}
+		else
+		{
+			m_Pclist_x = Pclist_x;
+			m_clist_x_forced = true;
+		}
+
+		if (!Pclist_y)
+		{
+			m_Pclist_y = make_clist(m_resample_src_y, m_resample_dst_y, m_boundary_op, func, support, filter_y_scale, src_y_ofs);
+			if (!m_Pclist_y)
+			{
+				m_status = STATUS_OUT_OF_MEMORY;
+				return;
+			}
+		}
+		else
+		{
+			m_Pclist_y = Pclist_y;
+			m_clist_y_forced = true;
+		}
+
+		if ((m_Psrc_y_count = (int*)calloc(m_resample_src_y, sizeof(int))) == NULL)
+		{
+			m_status = STATUS_OUT_OF_MEMORY;
+			return;
+		}
+
+		if ((m_Psrc_y_flag = (unsigned char*)calloc(m_resample_src_y, sizeof(unsigned char))) == NULL)
+		{
+			m_status = STATUS_OUT_OF_MEMORY;
+			return;
+		}
+
+		// Count how many times each source line contributes to a destination line.
+
+		for (i = 0; i < m_resample_dst_y; i++)
+			for (j = 0; j < m_Pclist_y[i].n; j++)
+				m_Psrc_y_count[resampler_range_check(m_Pclist_y[i].p[j].pixel, m_resample_src_y)]++;
+
+		if ((m_Pscan_buf = (Scan_Buf*)malloc(sizeof(Scan_Buf))) == NULL)
+		{
+			m_status = STATUS_OUT_OF_MEMORY;
+			return;
+		}
+
+		// Mark every scan buffer slot as free and without storage.
+		for (i = 0; i < MAX_SCAN_BUF_SIZE; i++)
+		{
+			m_Pscan_buf->scan_buf_y[i] = -1;
+			m_Pscan_buf->scan_buf_l[i] = NULL;
+		}
+
+		{
+			// Determine which axis to resample first by comparing the number of multiplies required
+			// for each possibility. The estimates are computed in 64 bits: with dimensions near
+			// BASISU_RESAMPLER_MAX_DIMENSION the products no longer fit in a 32-bit int.
+			const int64_t x_ops = count_ops(m_Pclist_x, m_resample_dst_x);
+			const int64_t y_ops = count_ops(m_Pclist_y, m_resample_dst_y);
+
+			// Hack 10/2000: Weight Y axis ops a little more than X axis ops.
+			// (Y axis ops use more cache resources.)
+			const int64_t xy_ops = x_ops * m_resample_src_y +
+				(4 * y_ops * m_resample_dst_x) / 3;
+
+			const int64_t yx_ops = (4 * y_ops * m_resample_src_x) / 3 +
+				x_ops * m_resample_dst_y;
+
+#if BASISU_RESAMPLER_DEBUG_OPS
+			printf("src: %i %i\n", m_resample_src_x, m_resample_src_y);
+			printf("dst: %i %i\n", m_resample_dst_x, m_resample_dst_y);
+			printf("x_ops: %lld\n", (long long)x_ops);
+			printf("y_ops: %lld\n", (long long)y_ops);
+			printf("xy_ops: %lld\n", (long long)xy_ops);
+			printf("yx_ops: %lld\n", (long long)yx_ops);
+#endif
+
+			// Now check which resample order is better. In case of a tie, choose the order
+			// which buffers the least amount of data.
+			if ((xy_ops > yx_ops) ||
+				((xy_ops == yx_ops) && (m_resample_src_x < m_resample_dst_x)))
+			{
+				m_delay_x_resample = true;
+				m_intermediate_x = m_resample_src_x;
+			}
+			else
+			{
+				m_delay_x_resample = false;
+				m_intermediate_x = m_resample_dst_x;
+			}
+#if BASISU_RESAMPLER_DEBUG_OPS
+			printf("delaying: %i\n", m_delay_x_resample);
+#endif
+		}
+
+		// The Y pass needs a separate accumulation line only when X resampling happens afterwards.
+		if (m_delay_x_resample)
+		{
+			if ((m_Ptmp_buf = (Sample*)malloc(m_intermediate_x * sizeof(Sample))) == NULL)
+			{
+				m_status = STATUS_OUT_OF_MEMORY;
+				return;
+			}
+		}
+	}
+
+	// Copies the X and/or Y contributor list pointers to the caller. Either output
+	// pointer may be NULL, in which case that list is simply not reported.
+	void Resampler::get_clists(Contrib_List * *ptr_clist_x, Contrib_List * *ptr_clist_y)
+	{
+		if (ptr_clist_x != NULL)
+		{
+			*ptr_clist_x = m_Pclist_x;
+		}
+
+		if (ptr_clist_y != NULL)
+		{
+			*ptr_clist_y = m_Pclist_y;
+		}
+	}
+
+	// Returns the number of entries in the global filter table (g_num_resample_filters).
+	int Resampler::get_filter_num()
+	{
+		return g_num_resample_filters;
+	}
+
+	// Returns the name of filter table entry filter_num, or NULL if the index is out of range.
+	const char* Resampler::get_filter_name(int filter_num)
+	{
+		const bool in_range = (filter_num >= 0) && (filter_num < g_num_resample_filters);
+		return in_range ? g_resample_filters[filter_num].name : NULL;
+	}
+	
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_resampler.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_resampler.h
@ -0,0 +1,196 @@
+// basisu_resampler.h
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "../transcoder/basisu.h"
+
+// Set to (1) to make the resampler count its multiply operations and printf() them.
+#define BASISU_RESAMPLER_DEBUG_OPS (0)
+// Filter name used when none is passed to the Resampler constructor.
+#define BASISU_RESAMPLER_DEFAULT_FILTER "lanczos4"
+// Sizes the resampler's scanline buffer (Resampler::MAX_SCAN_BUF_SIZE).
+#define BASISU_RESAMPLER_MAX_DIMENSION (16384)
+
+namespace basisu
+{
+	// Floating point type used for filter weights and samples (float or double).
+	typedef float Resample_Real;
+
+	// Streaming separable image resampler. Source scanlines are supplied one at a time with
+	// put_line() and finished destination scanlines are retrieved with get_line().
+	// Construction never throws; check status() before using a newly built instance.
+	class Resampler
+	{
+	public:
+		typedef Resample_Real Sample;
+
+		// One weighted reference to a source sample.
+		struct Contrib
+		{
+			Resample_Real weight;
+			uint16_t pixel;
+		};
+
+		// The n contributors (p[0..n-1]) that are summed to form one destination sample.
+		struct Contrib_List
+		{
+			uint16_t n;
+			Contrib *p;
+		};
+
+		enum Boundary_Op
+		{
+			BOUNDARY_WRAP = 0,
+			BOUNDARY_REFLECT = 1,
+			BOUNDARY_CLAMP = 2
+		};
+
+		enum Status
+		{
+			STATUS_OKAY = 0,
+			STATUS_OUT_OF_MEMORY = 1,
+			STATUS_BAD_FILTER_NAME = 2,
+			STATUS_SCAN_BUFFER_FULL = 3
+		};
+
+		// src_x/src_y - Input dimensions
+		// dst_x/dst_y - Output dimensions
+		// boundary_op - How to sample pixels near the image boundaries
+		// sample_low/sample_high - Clamp output samples to specified range, or disable clamping if sample_low >= sample_high
+		// Pfilter_name - Name of the filter to use; NULL selects BASISU_RESAMPLER_DEFAULT_FILTER
+		// Pclist_x/Pclist_y - Optional pointers to contributor lists from another instance of a Resampler
+		// src_x_ofs/src_y_ofs - Offset input image by specified amount (fractional values okay)
+		Resampler(
+			int src_x, int src_y,
+			int dst_x, int dst_y,
+			Boundary_Op boundary_op = BOUNDARY_CLAMP,
+			Resample_Real sample_low = 0.0f, Resample_Real sample_high = 0.0f,
+			const char *Pfilter_name = BASISU_RESAMPLER_DEFAULT_FILTER,
+			Contrib_List *Pclist_x = NULL,
+			Contrib_List *Pclist_y = NULL,
+			Resample_Real filter_x_scale = 1.0f,
+			Resample_Real filter_y_scale = 1.0f,
+			Resample_Real src_x_ofs = 0.0f,
+			Resample_Real src_y_ofs = 0.0f);
+
+		~Resampler();
+
+		// Reinits resampler so it can handle another frame.
+		void restart();
+
+		// Supplies the next source scanline. Returns false when the line could not be
+		// accepted: all source lines were already supplied, the scanline buffer is full,
+		// or memory ran out (see status()).
+		bool put_line(const Sample *Psrc);
+
+		// NULL if no scanlines are currently available (give the resampler more scanlines!)
+		const Sample *get_line();
+
+		Status status() const
+		{
+			return m_status;
+		}
+
+		// Returned contributor lists can be shared with another Resampler.
+		void get_clists(Contrib_List **ptr_clist_x, Contrib_List **ptr_clist_y);
+		Contrib_List *get_clist_x() const
+		{
+			return m_Pclist_x;
+		}
+		Contrib_List *get_clist_y() const
+		{
+			return m_Pclist_y;
+		}
+
+		// Filter accessors.
+		static int get_filter_num();
+		static const char *get_filter_name(int filter_num);
+
+		// Builds the contributor lists mapping src_x source samples onto dst_x destination
+		// samples for the given filter. Returns NULL on failure.
+		static Contrib_List *make_clist(
+			int src_x, int dst_x, Boundary_Op boundary_op,
+			Resample_Real(*Pfilter)(Resample_Real),
+			Resample_Real filter_support,
+			Resample_Real filter_scale,
+			Resample_Real src_ofs);
+
+	private:
+		// Not default-constructible or copyable (declared but intentionally not defined).
+		Resampler();
+		Resampler(const Resampler &o);
+		Resampler &operator=(const Resampler &o);
+
+		// BASISU_RESAMPLER_DEBUG_OPS is always defined (as 0 or 1), so it must be tested
+		// with #if rather than #ifdef to match the implementation file.
+#if BASISU_RESAMPLER_DEBUG_OPS
+		int total_ops;
+#endif
+
+		// Width of the buffered scanlines: source width when X resampling is delayed,
+		// destination width otherwise.
+		int m_intermediate_x;
+
+		int m_resample_src_x;
+		int m_resample_src_y;
+		int m_resample_dst_x;
+		int m_resample_dst_y;
+
+		Boundary_Op m_boundary_op;
+
+		Sample *m_Pdst_buf;
+		Sample *m_Ptmp_buf;
+
+		Contrib_List *m_Pclist_x;
+		Contrib_List *m_Pclist_y;
+
+		// True when the corresponding list was supplied by the caller and must not be freed.
+		bool m_clist_x_forced;
+		bool m_clist_y_forced;
+
+		// True when the Y pass runs first and X resampling happens per destination line.
+		bool m_delay_x_resample;
+
+		// Per source line: remaining number of destination lines it contributes to,
+		// and whether the line is currently held in the scan buffer.
+		int *m_Psrc_y_count;
+		uint8_t *m_Psrc_y_flag;
+
+		// The maximum number of scanlines that can be buffered at one time.
+		enum
+		{
+			MAX_SCAN_BUF_SIZE = BASISU_RESAMPLER_MAX_DIMENSION
+		};
+
+		// scan_buf_y[i] is the source line number held in slot i (-1 when the slot is free);
+		// scan_buf_l[i] is that slot's sample storage (NULL until first used).
+		struct Scan_Buf
+		{
+			int scan_buf_y[MAX_SCAN_BUF_SIZE];
+			Sample *scan_buf_l[MAX_SCAN_BUF_SIZE];
+		};
+
+		Scan_Buf *m_Pscan_buf;
+
+		int m_cur_src_y;
+		int m_cur_dst_y;
+
+		Status m_status;
+
+		void resample_x(Sample *Pdst, const Sample *Psrc);
+		void scale_y_mov(Sample *Ptmp, const Sample *Psrc, Resample_Real weight, int dst_x);
+		void scale_y_add(Sample *Ptmp, const Sample *Psrc, Resample_Real weight, int dst_x);
+		void clamp(Sample *Pdst, int n);
+		void resample_y(Sample *Pdst);
+
+		static int reflect(const int j, const int src_x, const Boundary_Op boundary_op);
+
+		// Sums the contributor counts of the first k entries of Pclist.
+		inline int count_ops(Contrib_List *Pclist, int k)
+		{
+			int i, t = 0;
+			for (i = 0; i < k; i++)
+				t += Pclist[i].n;
+			return (t);
+		}
+
+		// Output clamp range; only applied when m_lo < m_hi.
+		Resample_Real m_lo;
+		Resample_Real m_hi;
+
+		inline Resample_Real clamp_sample(Resample_Real f) const
+		{
+			if (f < m_lo)
+				f = m_lo;
+			else if (f > m_hi)
+				f = m_hi;
+			return f;
+		}
+	};
+
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_resampler_filters.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_resampler_filters.h
@ -0,0 +1,35 @@
+// basisu_resampler_filters.h
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "../transcoder/basisu.h"
+
+namespace basisu
+{
+	// Signature of a filter kernel function: maps a float position t to a float.
+	typedef float (*resample_filter_func)(float t);
+
+	// One entry of the global resampling filter table.
+	struct resample_filter
+	{
+		const char *name; // Name the Resampler constructor compares against with strcmp().
+		resample_filter_func func; // Kernel function handed to Resampler::make_clist().
+		float support; // Handed to Resampler::make_clist() as the filter support.
+	};
+
+	// The filter table and its entry count (defined in another translation unit).
+	extern const resample_filter g_resample_filters[];
+	extern const int g_num_resample_filters;
+
+	// Presumably looks up a filter by name and returns its table index; the result for an
+	// unknown name is not visible from this header -- check the definition.
+	int find_resample_filter(const char *pName);
+
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_ssim.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_ssim.cpp
@ -0,0 +1,408 @@
+// basisu_ssim.cpp
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_ssim.h"
+
+// Fallback definition for toolchains whose math header does not provide M_PI.
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+namespace basisu
+{
+	// Evaluates a Gaussian at integer offset (x, y) for the given variance (sigma squared).
+	// The value is exp(-(x^2 + y^2) / (2 * sigma_sqr)) scaled by 1 / sqrt(2 * pi * sigma_sqr).
+	float gauss(int x, int y, float sigma_sqr)
+	{
+		const float falloff = expf(-((x * x + y * y) / (2.0f * sigma_sqr)));
+		const float norm = 1.0f / (sqrtf((float)(2.0f * M_PI * sigma_sqr)));
+		return norm * falloff;
+	}
+		
+	// Fills pDst (size_x * size_y floats, row stride size_x) with a 2D Gaussian kernel
+	// centered on the middle element. size_x and size_y should both be odd.
+	// flags is a combination of the cComputeGaussianFlag* values: normalize so the kernel
+	// sums to one, normalize so the center is exactly one, and/or print the kernel.
+	void compute_gaussian_kernel(float *pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags)
+	{
+		assert(size_x & size_y & 1);
+
+		if ((size_x | size_y) == 0)
+			return;
+
+		const int center_x = size_x / 2;
+		const int center_y = size_y / 2;
+
+		double total = 0;
+		for (int x = 0; x < size_x; x++)
+		{
+			const bool left_of_center = x < center_x;
+			const bool right_of_center = x > center_x;
+			const int mirror_x = size_x - x - 1;
+
+			for (int y = 0; y < size_y; y++)
+			{
+				const bool above_center = y < center_y;
+				const bool below_center = y > center_y;
+				const int mirror_y = size_y - y - 1;
+
+				// The kernel is symmetric, so values in three of the quadrants are copied
+				// from mirrored entries that were already written earlier in this scan.
+				float g;
+				if (right_of_center && above_center)
+					g = pDst[mirror_x + y * size_x];
+				else if (left_of_center && below_center)
+					g = pDst[x + mirror_y * size_x];
+				else if (right_of_center && below_center)
+					g = pDst[mirror_x + mirror_y * size_x];
+				else
+					g = gauss(x - center_x, y - center_y, sigma_sqr);
+
+				pDst[x + y * size_x] = g;
+				total += g;
+			}
+		}
+
+		const int center_index = center_x + center_y * size_x;
+		const bool center_to_one = (flags & cComputeGaussianFlagNormalizeCenterToOne) != 0;
+		const bool normalize_sum = (flags & cComputeGaussianFlagNormalize) != 0;
+
+		// Center-to-one normalization divides by the center value instead of the kernel sum.
+		if (center_to_one)
+		{
+			total = pDst[center_index];
+		}
+
+		if (center_to_one || normalize_sum)
+		{
+			const double one_over_total = 1.0f / total;
+			const int num_entries = size_x * size_y;
+			for (int i = 0; i < num_entries; i++)
+				pDst[i] = static_cast<float>(pDst[i] * one_over_total);
+
+			// Pin the center to exactly one, free of rounding error.
+			if (center_to_one)
+				pDst[center_index] = 1.0f;
+		}
+
+		if (flags & cComputeGaussianFlagPrint)
+		{
+			printf("{\n");
+			for (int row = 0; row < size_y; row++)
+			{
+				printf("  ");
+				for (int col = 0; col < size_x; col++)
+				{
+					printf("%f, ", pDst[col + row * size_x]);
+				}
+				printf("\n");
+			}
+			printf("}");
+		}
+	}
+
+	// Convolves orig_img with a normalized odd_filter_width x odd_filter_width Gaussian kernel
+	// and writes the result into dst. The output is decimated by width_divisor/height_divisor:
+	// each output pixel is sampled around the middle of its source cell. Source reads outside
+	// the image are clamped or wrapped depending on `wrapping`.
+	void gaussian_filter(imagef &dst, const imagef &orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping, uint32_t width_divisor, uint32_t height_divisor)
+	{
+		assert(odd_filter_width && (odd_filter_width & 1));
+		odd_filter_width |= 1;
+
+		vector2D<float> kernel(odd_filter_width, odd_filter_width);
+		compute_gaussian_kernel(kernel.get_ptr(), odd_filter_width, odd_filter_width, sigma_sqr, cComputeGaussianFlagNormalize);
+
+		const int out_width = orig_img.get_width() / width_divisor;
+		const int out_height = orig_img.get_height() / height_divisor;
+
+		// Kernel taps run from -radius to +radius on both axes.
+		const int radius = odd_filter_width / 2;
+
+		dst.crop(out_width, out_height);
+
+//#pragma omp parallel for
+		for (int oy = 0; oy < out_height; oy++)
+		{
+			for (int ox = 0; ox < out_width; ox++)
+			{
+				vec4F accum(0.0f);
+
+				for (int ky = -radius; ky <= radius; ky++)
+				{
+					const int sy = oy * height_divisor + (height_divisor >> 1) + ky;
+
+					for (int kx = -radius; kx <= radius; kx++)
+					{
+						const int sx = ox * width_divisor + (width_divisor >> 1) + kx;
+
+						const vec4F &texel = orig_img.get_clamped_or_wrapped(sx, sy, wrapping, wrapping);
+						const float weight = kernel(kx + radius, ky + radius);
+
+						for (uint32_t ch = 0; ch < 4; ch++)
+							accum[ch] += texel[ch] * weight;
+					}
+				}
+
+				dst(ox, oy).set(accum[0], accum[1], accum[2], accum[3]);
+			}
+		}
+	}
+
+	// dst = src raised per channel to `power`. dst is resized to match src first.
+	// When every exponent is exactly 2 the pixels are squared by multiplication
+	// instead of going through powf().
+	void pow_image(const imagef &src, imagef &dst, const vec4F &power)
+	{
+		dst.resize(src);
+
+		const bool square_only = (power[0] == 2.0f) && (power[1] == 2.0f) && (power[2] == 2.0f) && (power[3] == 2.0f);
+		const int height = (int)dst.get_height();
+		const uint32_t width = dst.get_width();
+
+//#pragma omp parallel for
+		for (int y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F &p = src(x, y);
+
+				if (square_only)
+				{
+					dst(x, y).set(p[0] * p[0], p[1] * p[1], p[2] * p[2], p[3] * p[3]);
+				}
+				else
+				{
+					dst(x, y).set(powf(p[0], power[0]), powf(p[1], power[1]), powf(p[2], power[2]), powf(p[3], power[3]));
+				}
+			}
+		}
+	}
+
+	// dst = src * mul, channel by channel. dst is resized to match src first.
+	void mul_image(const imagef &src, imagef &dst, const vec4F &mul)
+	{
+		dst.resize(src);
+
+		const int height = (int)dst.get_height();
+		const uint32_t width = dst.get_width();
+
+//#pragma omp parallel for
+		for (int y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F &p = src(x, y);
+
+				// Build the result in a temporary so dst may alias src.
+				vec4F d;
+				for (uint32_t c = 0; c < 4; c++)
+					d[c] = p[c] * mul[c];
+
+				dst(x, y) = d;
+			}
+		}
+	}
+
+	// dst = scale * src + shift, channel by channel. dst is resized to match src first.
+	void scale_image(const imagef &src, imagef &dst, const vec4F &scale, const vec4F &shift)
+	{
+		dst.resize(src);
+
+		const int height = (int)dst.get_height();
+		const uint32_t width = dst.get_width();
+
+//#pragma omp parallel for
+		for (int y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F &p = src(x, y);
+
+				dst(x, y).set(
+					scale[0] * p[0] + shift[0],
+					scale[1] * p[1] + shift[1],
+					scale[2] * p[2] + shift[2],
+					scale[3] * p[3] + shift[3]);
+			}
+		}
+	}
+
+	// dst = src1 * alpha + src2 * beta + gamma, channel by channel.
+	// dst is resized to match src1 first; src2 is read at the same coordinates.
+	void add_weighted_image(const imagef &src1, const vec4F &alpha, const imagef &src2, const vec4F &beta, const vec4F &gamma, imagef &dst)
+	{
+		dst.resize(src1);
+
+		const int height = (int)dst.get_height();
+		const uint32_t width = dst.get_width();
+
+//#pragma omp parallel for
+		for (int y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F &s1 = src1(x, y);
+				const vec4F &s2 = src2(x, y);
+
+				// Build the result in a temporary so dst may alias either source.
+				vec4F d;
+				for (uint32_t c = 0; c < 4; c++)
+					d[c] = s1[c] * alpha[c] + s2[c] * beta[c] + gamma[c];
+
+				dst(x, y) = d;
+			}
+		}
+	}
+
+	// dst = src1 + src2, channel by channel. dst is resized to match src1 first.
+	void add_image(const imagef &src1, const imagef &src2, imagef &dst)
+	{
+		dst.resize(src1);
+
+		const int height = (int)dst.get_height();
+		const uint32_t width = dst.get_width();
+
+//#pragma omp parallel for
+		for (int y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F &s1 = src1(x, y);
+				const vec4F &s2 = src2(x, y);
+
+				// Build the result in a temporary so dst may alias either source.
+				vec4F d;
+				for (uint32_t c = 0; c < 4; c++)
+					d[c] = s1[c] + s2[c];
+
+				dst(x, y) = d;
+			}
+		}
+	}
+
+	// dst = src + value, adding the same per-channel constant to every pixel.
+	// dst is resized to match src first.
+	void adds_image(const imagef &src, const vec4F &value, imagef &dst)
+	{
+		dst.resize(src);
+
+		const int height = (int)dst.get_height();
+		const uint32_t width = dst.get_width();
+
+//#pragma omp parallel for
+		for (int y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F &p = src(x, y);
+
+				// Build the result in a temporary so dst may alias src.
+				vec4F d;
+				for (uint32_t c = 0; c < 4; c++)
+					d[c] = p[c] + value[c];
+
+				dst(x, y) = d;
+			}
+		}
+	}
+
+	// dst = src1 * src2 * scale, channel by channel. dst is resized to match src1 first.
+	void mul_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale)
+	{
+		dst.resize(src1);
+
+		const int height = (int)dst.get_height();
+		const uint32_t width = dst.get_width();
+
+//#pragma omp parallel for
+		for (int y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F &s1 = src1(x, y);
+				const vec4F &s2 = src2(x, y);
+
+				// All four products are evaluated before set() stores them, so dst may
+				// alias either source.
+				dst(x, y).set(
+					s1[0] * s2[0] * scale[0],
+					s1[1] * s2[1] * scale[1],
+					s1[2] * s2[2] * scale[2],
+					s1[3] * s2[3] * scale[3]);
+			}
+		}
+	}
+
+	// dst = (src1 * scale) / src2, channel by channel. A channel whose divisor is exactly
+	// zero yields zero instead of dividing. dst is resized to match src1 first.
+	void div_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale)
+	{
+		dst.resize(src1);
+
+		const int height = (int)dst.get_height();
+		const uint32_t width = dst.get_width();
+
+//#pragma omp parallel for
+		for (int y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F &numer = src1(x, y);
+				const vec4F &denom = src2(x, y);
+
+				vec4F quotient;
+
+				for (uint32_t c = 0; c < 4; c++)
+				{
+					const float divisor = denom[c];
+					quotient[c] = (divisor == 0.0f) ? 0.0f : ((numer[c] * scale[c]) / divisor);
+				}
+
+				dst(x, y) = quotient;
+			}
+		}
+	}
+
+	// Returns the per-channel mean of all pixels in src (sum divided by get_total_pixels()).
+	vec4F avg_image(const imagef &src)
+	{
+		vec4F sum(0.0f);
+
+		const uint32_t height = src.get_height();
+		const uint32_t width = src.get_width();
+
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F &s = src(x, y);
+				sum += vec4F(s[0], s[1], s[2], s[3]);
+			}
+		}
+
+		const float pixel_count = static_cast<float>(src.get_total_pixels());
+		sum /= pixel_count;
+
+		return sum;
+	}
+		
+	// Computes the mean SSIM (structural similarity) of images a and b, independently per channel.
+	// Reference: https://ece.uwaterloo.ca/~z70wang/research/ssim/index.html
+	// Local statistics are gathered with an 11-tap Gaussian window (sigma = 1.5).
+	// Per pixel: SSIM = ((2*mu1*mu2 + C1) * (2*s12 + C2)) / ((mu1^2 + mu2^2 + C1) * (s1^2 + s2^2 + C2)).
+	vec4F compute_ssim(const imagef &a, const imagef &b)
+	{
+		imagef axb, a_sq, b_sq, mu1, mu2, mu1_sq, mu2_sq, mu1_mu2, s1_sq, s2_sq, s12, smap, t1, t2, t3;
+
+		// C1 = (0.01 * 255)^2 and C2 = (0.03 * 255)^2; this presumably assumes samples in
+		// the 0..255 range -- confirm against callers.
+		const float C1 = 6.50250f, C2 = 58.52250f;
+				
+		// Squares and cross product of the inputs.
+		pow_image(a, a_sq, vec4F(2));
+		pow_image(b, b_sq, vec4F(2));
+		mul_image(a, b, axb, vec4F(1.0f));
+
+		// Local means.
+		gaussian_filter(mu1, a, 11, 1.5f * 1.5f);
+		gaussian_filter(mu2, b, 11, 1.5f * 1.5f);
+
+		pow_image(mu1, mu1_sq, vec4F(2));
+		pow_image(mu2, mu2_sq, vec4F(2));
+		mul_image(mu1, mu2, mu1_mu2, vec4F(1.0f));
+
+		// Local variance of a: blur(a^2) - mu1^2.
+		gaussian_filter(s1_sq, a_sq, 11, 1.5f * 1.5f);
+		add_weighted_image(s1_sq, vec4F(1), mu1_sq, vec4F(-1), vec4F(0), s1_sq);
+
+		// Local variance of b: blur(b^2) - mu2^2.
+		gaussian_filter(s2_sq, b_sq, 11, 1.5f * 1.5f);
+		add_weighted_image(s2_sq, vec4F(1), mu2_sq, vec4F(-1), vec4F(0), s2_sq);
+
+		// Local covariance: blur(a*b) - mu1*mu2.
+		gaussian_filter(s12, axb, 11, 1.5f * 1.5f);
+		add_weighted_image(s12, vec4F(1), mu1_mu2, vec4F(-1), vec4F(0), s12);
+
+		// Numerator: t3 = (2*mu1*mu2 + C1) * (2*s12 + C2).
+		scale_image(mu1_mu2, t1, vec4F(2), vec4F(0));
+		adds_image(t1, vec4F(C1), t1);
+
+		scale_image(s12, t2, vec4F(2), vec4F(0));
+		adds_image(t2, vec4F(C2), t2);
+
+		mul_image(t1, t2, t3, vec4F(1));
+
+		// Denominator: t1 = (mu1^2 + mu2^2 + C1) * (s1^2 + s2^2 + C2). t1/t2 are reused.
+		add_image(mu1_sq, mu2_sq, t1);
+		adds_image(t1, vec4F(C1), t1);
+
+		add_image(s1_sq, s2_sq, t2);
+		adds_image(t2, vec4F(C2), t2);
+
+		mul_image(t1, t2, t1, vec4F(1));
+
+		// Per-pixel SSIM map; div_image() writes 0 wherever the denominator is exactly 0.
+		div_image(t3, t1, smap, vec4F(1));
+
+		return avg_image(smap);
+	}
+
+	// Computes SSIM for two 8-bit images by converting them to float images and calling the
+	// imagef overload. Images of different size are first cropped to their common width and
+	// height. If the resulting image is empty the function asserts and returns zero.
+	// luma: when true, each pixel is replaced by set(get_luma(luma_601), a) before comparison.
+	vec4F compute_ssim(const image &a, const image &b, bool luma, bool luma_601)
+	{
+		image ta(a), tb(b);
+
+		const bool same_dimensions = (ta.get_width() == tb.get_width()) && (ta.get_height() == tb.get_height());
+		if (!same_dimensions)
+		{
+			debug_printf("compute_ssim: Cropping input images to equal dimensions\n");
+
+			const uint32_t common_width = minimum(a.get_width(), b.get_width());
+			const uint32_t common_height = minimum(a.get_height(), b.get_height());
+			ta.crop(common_width, common_height);
+			tb.crop(common_width, common_height);
+		}
+
+		if ((ta.get_width() == 0) || (ta.get_height() == 0))
+		{
+			assert(0);
+			return vec4F(0);
+		}
+
+		if (luma)
+		{
+			const uint32_t height = ta.get_height();
+			const uint32_t width = ta.get_width();
+
+			for (uint32_t y = 0; y < height; y++)
+			{
+				for (uint32_t x = 0; x < width; x++)
+				{
+					ta(x, y).set(ta(x, y).get_luma(luma_601), ta(x, y).a);
+					tb(x, y).set(tb(x, y).get_luma(luma_601), tb(x, y).a);
+				}
+			}
+		}
+
+		imagef float_a, float_b;
+
+		float_a.set(ta);
+		float_b.set(tb);
+
+		return compute_ssim(float_a, float_b);
+	}
+
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_ssim.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_ssim.h
@ -0,0 +1,44 @@
+// basisu_ssim.h
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_enc.h"
+
+namespace basisu
+{
+	// Evaluates a Gaussian at integer offset (x, y) for the given variance (sigma squared).
+	float gauss(int x, int y, float sigma_sqr);
+
+	// Flags accepted by compute_gaussian_kernel(); may be combined.
+	enum
+	{
+		cComputeGaussianFlagNormalize = 1, // Scale the kernel so its entries sum to one.
+		cComputeGaussianFlagPrint = 2, // printf() the finished kernel.
+		cComputeGaussianFlagNormalizeCenterToOne = 4 // Scale the kernel so its center entry is exactly one.
+	};
+
+	// Fills pDst (size_x * size_y floats) with a Gaussian kernel; size_x and size_y should be odd.
+	void compute_gaussian_kernel(float *pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags = 0);
+
+	// Per-channel image arithmetic. Each function resizes dst to match its first source image.
+	// dst = scale * src + shift
+	void scale_image(const imagef &src, imagef &dst, const vec4F &scale, const vec4F &shift);
+	// dst = src1 * alpha + src2 * beta + gamma
+	void add_weighted_image(const imagef &src1, const vec4F &alpha, const imagef &src2, const vec4F &beta, const vec4F &gamma, imagef &dst);
+	// dst = src1 + src2
+	void add_image(const imagef &src1, const imagef &src2, imagef &dst);
+	// dst = src + value
+	void adds_image(const imagef &src, const vec4F &value, imagef &dst);
+	// dst = src1 * src2 * scale
+	void mul_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale);
+	// dst = (src1 * scale) / src2, with 0 written wherever src2 is exactly 0
+	void div_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale);
+	// Returns the per-channel mean over all pixels of src.
+	vec4F avg_image(const imagef &src);
+
+	// Gaussian-blurs orig_img into dst with an odd_filter_width square kernel, optionally
+	// decimating the output by width_divisor/height_divisor. `wrapping` selects wrapped
+	// instead of clamped reads outside the image.
+	void gaussian_filter(imagef &dst, const imagef &orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping = false, uint32_t width_divisor = 1, uint32_t height_divisor = 1);
+
+	// Mean SSIM per channel. The `image` overload crops both inputs to their common size and
+	// optionally converts them to luma before comparing.
+	vec4F compute_ssim(const imagef &a, const imagef &b);
+	vec4F compute_ssim(const image &a, const image &b, bool luma, bool luma_601);
+
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
+++ b/engine/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
--- a/engine/thirdparty/basis_universal/encoder/basisu_uastc_enc.h
+++ b/engine/thirdparty/basis_universal/encoder/basisu_uastc_enc.h
@ -0,0 +1,140 @@
+// basisu_uastc_enc.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_etc.h"
+
+#include "../transcoder/basisu_transcoder_uastc.h"
+
+namespace basisu
+{
+	// Number of cPackUASTCLevel* quality levels defined below (values 0 through 4).
+	const uint32_t TOTAL_PACK_UASTC_LEVELS = 5;
+
+	// Flags for encode_uastc() and uastc_rdo(): one quality level combined with optional flag bits.
+	enum
+	{
+		// Fastest is the lowest quality, although it's still substantially higher quality vs. BC1/ETC1. It supports 5 modes.
+		// The output may be somewhat blocky because this setting doesn't support 2/3-subset UASTC modes, but it should be less blocky vs. BC1/ETC1.
+		// This setting doesn't write BC1 hints, so BC1 transcoding will be slower.
+		// Transcoded ETC1 quality will be lower because it only considers 2 hints out of 32.
+		// Avg. 43.45 dB
+		cPackUASTCLevelFastest = 0,
+		
+		// Faster is ~3x slower than fastest. It supports 9 modes.
+		// Avg. 46.49 dB
+		cPackUASTCLevelFaster = 1,
+		
+		// Default is ~5.5x slower than fastest. It supports 14 modes.
+		// Avg. 47.47 dB
+		cPackUASTCLevelDefault = 2,
+
+		// Slower is ~14.5x slower than fastest. It supports all 18 modes.
+		// Avg. 48.01 dB
+		cPackUASTCLevelSlower = 3,
+
+		// VerySlow is ~200x slower than fastest. 
+		// The best quality the codec is capable of, but you'll need to be patient or have a lot of cores.
+		// Avg. 48.24 dB
+		cPackUASTCLevelVerySlow = 4,
+
+		// Mask for the level portion of the flags.
+		// NOTE(review): cPackUASTCFavorUASTCError (8) lies inside this mask (0xF), so
+		// (flags & cPackUASTCLevelMask) also picks up that flag bit -- confirm how the
+		// encoder extracts the level.
+		cPackUASTCLevelMask = 0xF,
+
+		// By default the encoder tries to strike a balance between UASTC and transcoded BC7 quality.
+		// These flags allow you to favor only optimizing for lowest UASTC error, or lowest BC7 error.
+		cPackUASTCFavorUASTCError = 8,
+		cPackUASTCFavorBC7Error = 16,
+						
+		// ETC1 hint options. Presumably these trade ETC1 hint quality for encoding speed --
+		// confirm against the encoder implementation.
+		cPackUASTCETC1FasterHints = 64,
+		cPackUASTCETC1FastestHints = 128,
+		cPackUASTCETC1DisableFlipAndIndividual = 256,
+		
+		// Favor UASTC modes 0 and 10 more than the others (this is experimental, it's useful for RDO compression)
+		cPackUASTCFavorSimplerModes = 512, 
+	};
+
+	// Encodes one 4x4 pixel block to UASTC.
+	// pRGBAPixels: Pointer to source 4x4 block of RGBA pixels (R first in memory).
+	// output_block: Reference to destination UASTC block.
+	// flags: One cPackUASTCLevel* value (speed vs. quality tradeoff), optionally combined with the other cPackUASTC* flags.
+	void encode_uastc(const uint8_t* pRGBAPixels, basist::uastc_block& output_block, uint32_t flags = cPackUASTCLevelDefault);
+
+	// One encoding result, consumed by pack_uastc().
+	// NOTE(review): the field descriptions below are inferred from the names only -- confirm
+	// against the encoder implementation.
+	struct uastc_encode_results
+	{
+		uint32_t m_uastc_mode; // presumably the selected UASTC mode index
+		uint32_t m_common_pattern; // presumably the partition pattern index for multi-subset modes
+		basist::astc_block_desc m_astc; // presumably the logical ASTC description of the block
+		color_rgba m_solid_color; // presumably the color used for solid-color blocks
+		uint64_t m_astc_err; // presumably the error of this candidate
+	};
+			  
+	// Packs `result` plus the supplied ETC1/EAC/BC1 transcoding hint data into the physical UASTC block `blk`.
+	void pack_uastc(basist::uastc_block& blk, const uastc_encode_results& result, const etc_block& etc1_blk, uint32_t etc1_bias, const eac_a8_block& etc_eac_a8_blk, bool bc1_hint0, bool bc1_hint1);
+
+	// Default for uastc_rdo_params::m_lz_dict_size, in bytes.
+	const uint32_t UASCT_RDO_DEFAULT_LZ_DICT_SIZE = 4096;
+
+	// Defaults for uastc_rdo_params::m_max_allowed_rms_increase_ratio and m_skip_block_rms_thresh.
+	const float UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO = 10.0f;
+	const float UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH = 8.0f;
+	
+	// The RDO encoder computes a smoothness factor, from [0,1], for each block. To do this it computes each block's maximum component variance, then it divides this by this factor and clamps the result.
+	// Larger values will result in more blocks being protected from too much distortion.
+	const float UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV = 18.0f;
+	
+	// The RDO encoder can artificially boost the error of smooth blocks, in order to suppress distortions on smooth areas of the texture.
+	// The encoder will use this value as the maximum error scale to use on smooth blocks. The larger this value, the better smooth blocks will look. Set to 1.0 to disable this completely.
+	const float UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE = 10.0f;
+
+	// Tuning parameters for uastc_rdo(). A default-constructed instance holds the defaults set by clear().
+	struct uastc_rdo_params
+	{
+		uastc_rdo_params()
+		{
+			clear();
+		}
+
+		// Resets every parameter to its default value.
+		void clear()
+		{
+			m_lz_dict_size = UASCT_RDO_DEFAULT_LZ_DICT_SIZE;
+			m_lambda = 0.5f;
+			m_max_allowed_rms_increase_ratio = UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO;
+			m_skip_block_rms_thresh = UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH;
+			m_endpoint_refinement = true;
+			m_lz_literal_cost = 100;
+						
+			m_max_smooth_block_std_dev = UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV;
+			m_smooth_block_max_error_scale = UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE;
+		}
+				
+		// m_lz_dict_size: Size of LZ dictionary to simulate in bytes. The larger this value, the slower the encoder but the higher the quality per LZ compressed bit.
+		uint32_t m_lz_dict_size;
+
+		// m_lambda: The post-processor tries to reduce distortion+rate*lambda (rate is approximate LZ bits and distortion is scaled MS error).
+		// Larger values push the postprocessor towards optimizing more for lower rate, and smaller values more for distortion. 0=minimal distortion.
+		float m_lambda;
+		
+		// m_max_allowed_rms_increase_ratio: How much the RMS error of a block is allowed to increase before a trial is rejected. 1.0=no increase allowed, 1.05=5% increase allowed, etc.
+		float m_max_allowed_rms_increase_ratio;
+		
+		// m_skip_block_rms_thresh: Blocks with this much RMS error or more are completely skipped by the RDO encoder. 
+		float m_skip_block_rms_thresh;
+
+		// m_endpoint_refinement: If true, the post-process will attempt to refine the endpoints of blocks with modified selectors. 
+		bool m_endpoint_refinement;
+
+		// Smooth block protection; see the UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV and
+		// UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE constants for their defaults.
+		float m_max_smooth_block_std_dev;
+		float m_smooth_block_max_error_scale;
+		
+		// m_lz_literal_cost: Defaults to 100. Presumably the cost assigned to an LZ literal in
+		// the rate estimate -- confirm units against the RDO implementation.
+		uint32_t m_lz_literal_cost;
+	};
+
+	// Rate-distortion optimization post-process over an array of already encoded UASTC blocks.
+	// num_blocks, pBlocks: Number of blocks and pointer to UASTC blocks to process.
+	// pBlock_pixels: Pointer to an array of 4x4 blocks containing the original texture pixels. This is NOT a raster image, but a pointer to individual 4x4 blocks.
+	// params: RDO tuning parameters (see uastc_rdo_params).
+	// flags: Pass in the same flags used to encode the UASTC blocks. The flags are used to reencode the transcode hints in the same way.
+	// pJob_pool, total_jobs: Optional job pool and job count; presumably used to process blocks in parallel -- confirm in the implementation.
+	// The meaning of the bool result is not visible from this header -- presumably false on failure.
+	bool uastc_rdo(uint32_t num_blocks, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params &params, uint32_t flags = cPackUASTCLevelDefault, job_pool* pJob_pool = nullptr, uint32_t total_jobs = 0);
+} // namespace basisu
--- a/engine/thirdparty/basis_universal/encoder/cppspmd_flow.h
+++ b/engine/thirdparty/basis_universal/encoder/cppspmd_flow.h
@ -0,0 +1,590 @@
+// Do not include this header directly.
+// Control flow functionality in common between all the headers.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef _DEBUG
+// Debug-only consistency check: asserts that andnot(m_kernel_exec, m_exec) has no lane set.
+// NOTE(review): judging by how spmd_return() uses andnot(), this presumably means "no lane is active in m_exec
+// that has already been disabled in m_kernel_exec" - confirm andnot()'s operand order.
+CPPSPMD_FORCE_INLINE void spmd_kernel::check_masks()
+{
+	assert(!any(andnot(m_kernel_exec, m_exec)));
+}
+#endif
+
+// SPMD "break": switches off every lane in m_exec. Debug builds assert that a loop is in progress.
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_break()
+{
+#if defined(_DEBUG)
+	assert(m_in_loop);
+#endif
+
+	// Nothing keeps executing after the break.
+	m_exec = exec_mask::all_off();
+}
+
+// SPMD "continue": the currently active lanes are added to m_continue_mask (the loop constructs OR that mask
+// back into m_exec once the body has run) and are then switched off. Debug builds assert that a loop is in progress.
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_continue()
+{
+#if defined(_DEBUG)
+	assert(m_in_loop);
+#endif
+
+	// Record the lanes that are continuing before they get disabled.
+	const exec_mask lanes_continuing = m_exec;
+	m_continue_mask = m_continue_mask | lanes_continuing;
+
+	m_exec = exec_mask::all_off();
+}
+
+// SPMD "return": removes the currently active lanes from m_kernel_exec and then clears m_exec.
+// The loop constructs in this file AND their restored exec mask with m_kernel_exec, so these lanes stay off afterwards.
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_return()
+{
+	// Permanently kill all active lanes
+	m_kernel_exec = andnot(m_exec, m_kernel_exec);
+	m_exec = exec_mask::all_off();
+}
+			
+// Runs unmaskedBody with every lane forced on in both m_exec and m_kernel_exec.
+// Afterwards each mask is ANDed with the value it had on entry, so lanes the body switched off stay off
+// and lanes that were off before the call are not switched on.
+template<typename UnmaskedBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_unmasked(const UnmaskedBody& unmaskedBody)
+{
+	const exec_mask saved_exec = m_exec;
+	const exec_mask saved_kernel_exec = m_kernel_exec;
+
+	m_exec = exec_mask::all_on();
+	m_kernel_exec = exec_mask::all_on();
+
+	unmaskedBody();
+
+	m_exec = m_exec & saved_exec;
+	m_kernel_exec = m_kernel_exec & saved_kernel_exec;
+
+	check_masks();
+}
+
+// Scope object behind SPMD_UNMASKED_BEGIN/SPMD_UNMASKED_END.
+// The constructor saves the kernel's m_exec and m_kernel_exec and forces both to all-on.
+// The destructor ANDs the current masks with the saved ones (so lanes disabled inside the scope stay disabled) and runs check_masks().
+struct scoped_unmasked_restorer
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_orig_exec, m_orig_kernel_exec;
+				
+	CPPSPMD_FORCE_INLINE scoped_unmasked_restorer(spmd_kernel *pKernel) : 
+		m_pKernel(pKernel), 
+		m_orig_exec(pKernel->m_exec),
+		m_orig_kernel_exec(pKernel->m_kernel_exec)
+	{
+		pKernel->m_kernel_exec = exec_mask::all_on();
+		pKernel->m_exec = exec_mask::all_on();
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_unmasked_restorer() 
+	{ 
+		m_pKernel->m_kernel_exec = m_pKernel->m_kernel_exec & m_orig_kernel_exec;
+		m_pKernel->m_exec = m_pKernel->m_exec & m_orig_exec;
+		m_pKernel->check_masks();
+	}
+};
+
+// SPMD_UNMASKED_BEGIN opens a brace scope holding a scoped_unmasked_restorer built from "this"; SPMD_UNMASKED_END closes that scope.
+#define SPMD_UNMASKED_BEGIN { scoped_unmasked_restorer _unmasked_restorer(this); 
+#define SPMD_UNMASKED_END }
+
+// spmd_call: default-constructs an SPMDKernel, initializes it with the caller's current m_exec, and forwards the arguments to its _call().
+// The "#if 0" variant (disabled) would also return _call()'s result via decltype(auto); the active variant discards any result and returns void.
+#if 0
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE decltype(auto) spmd_kernel::spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(m_exec);
+	return kernel._call(std::forward<Args>(args)...);
+}
+#else
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	// The callee kernel starts with the caller's current execution mask.
+	kernel.init(m_exec);
+	kernel._call(std::forward<Args>(args)...);
+}
+#endif
+
+// Conditional SPMD break: lanes that are currently active and for which cond is set are removed from m_exec.
+// Debug builds assert that a loop is in progress.
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if_break(const vbool& cond)
+{
+#if defined(_DEBUG)
+	assert(m_in_loop);
+#endif
+
+	exec_mask cond_lanes(cond);
+	const exec_mask breaking_lanes = m_exec & cond_lanes;
+
+	m_exec = andnot(breaking_lanes, m_exec);
+
+	check_masks();
+}
+
+// "Simple" if: runs ifBody with m_exec narrowed to the active lanes where cond is set, then puts m_exec back
+// exactly as it was. ifBody is skipped when no lane qualifies.
+// No SPMD breaks, continues, etc. allowed
+template<typename IfBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sif(const vbool& cond, const IfBody& ifBody)
+{
+	const exec_mask saved_exec = m_exec;
+	const exec_mask taken_lanes = saved_exec & exec_mask(cond);
+
+	if (!any(taken_lanes))
+		return;
+
+	m_exec = taken_lanes;
+	ifBody();
+	m_exec = saved_exec;
+}
+
+// "Simple" if/else: ifBody runs on the active lanes where cond is set, elseBody on the active lanes where !cond is set.
+// Each body is skipped when it has no lanes. m_exec is restored to its entry value at the end.
+// No SPMD breaks, continues, etc. allowed
+template<typename IfBody, typename ElseBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody)
+{
+	const exec_mask saved_exec = m_exec;
+
+	const exec_mask if_lanes = saved_exec & exec_mask(cond);
+	if (any(if_lanes))
+	{
+		m_exec = if_lanes;
+		ifBody();
+	}
+
+	// Both branches are derived from the entry mask, not from whatever ifBody left behind.
+	const exec_mask else_lanes = saved_exec & exec_mask(!cond);
+	if (any(else_lanes))
+	{
+		m_exec = else_lanes;
+		elseBody();
+	}
+
+	m_exec = saved_exec;
+}
+
+// SPMD if: runs ifBody on the active lanes where cond is set (skipped when there are none).
+// Lanes the body disables stay disabled afterwards; lanes that did not enter the body are switched back on.
+template<typename IfBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if(const vbool& cond, const IfBody& ifBody)
+{
+	const exec_mask cond_lanes(cond);
+	const exec_mask taken_lanes = cond_lanes & m_exec;
+
+	if (!any(taken_lanes))
+		return;
+
+	// Active lanes that fail the condition; they sit out the body.
+	const exec_mask skipped_lanes = andnot(cond_lanes, m_exec);
+
+	m_exec = taken_lanes;
+	ifBody();
+
+	// Keep whatever survived the body and re-enable the lanes that skipped it.
+	m_exec = m_exec | skipped_lanes;
+
+	check_masks();
+}
+
+// SPMD if/else: runs ifBody on the active lanes where cond is set and elseBody on the active lanes where it is not.
+// A body is skipped when it has no lanes, and the else branch is not even considered when every active lane took the if branch.
+// Lanes disabled inside either body stay disabled afterwards; lanes that did not enter a body are switched back on after it.
+template<typename IfBody, typename ElseBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody)
+{
+	// Set when every active lane takes the if branch, so the else branch can be skipped outright.
+	bool all_flag = false;
+
+	exec_mask cond_exec(cond);
+		
+	{
+		exec_mask pre_if_exec = cond_exec & m_exec;
+
+		int mask = pre_if_exec.get_movemask();
+		if (mask != 0)
+		{
+			all_flag = ((uint32_t)mask == m_exec.get_movemask());
+
+			exec_mask unexecuted_lanes = andnot(cond_exec, m_exec);
+			m_exec = pre_if_exec;
+
+			ifBody();
+
+			// Propagate any lanes that got disabled inside the if body into the exec mask outside the if body, but turn on any lanes that didn't execute inside the if body.
+			m_exec = m_exec | unexecuted_lanes;
+
+			check_masks();
+		}
+	}
+
+	if (!all_flag)
+	{
+		exec_mask pre_else_exec = andnot(cond_exec, m_exec);
+
+		if (any(pre_else_exec))
+		{
+			exec_mask unexecuted_lanes = cond_exec & m_exec;
+			m_exec = pre_else_exec;
+
+			// Run the else body here; the original code invoked ifBody() a second time and never called elseBody().
+			elseBody();
+
+			// Propagate any lanes that got disabled inside the else body into the exec mask outside the else body, but turn on any lanes that didn't execute inside the else body.
+			m_exec = m_exec | unexecuted_lanes;
+
+			check_masks();
+		}
+	}
+}
+
+// Scope guard for an exec_mask: copies *pExec_mask when constructed and writes that copy back when destroyed.
+struct scoped_exec_restorer
+{
+	exec_mask *m_pMask;
+	exec_mask m_prev_mask;
+
+	CPPSPMD_FORCE_INLINE scoped_exec_restorer(exec_mask *pExec_mask) :
+		m_pMask(pExec_mask),
+		m_prev_mask(*pExec_mask)
+	{
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_exec_restorer()
+	{
+		*m_pMask = m_prev_mask;
+	}
+};
+
+// "Simple" if/else macros.
+// SPMD_SIF(cond) computes (m_exec & cond) into a line-unique temporary mask. If any lane is set it opens a scope, saves m_exec in a
+// scoped_exec_restorer (which restores it when the scope closes) and narrows m_exec to the temporary mask.
+// SPMD_SELSE(cond) closes the if scope and does the same with !cond; pass it the same condition as the matching SPMD_SIF.
+// SPMD_SENDIF closes the last scope.
+// Cannot use SPMD break, continue, or return inside "simple" if/else
+#define SPMD_SIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SENDIF }
+
+// Same as SPMD_SIF, except doesn't use a scoped object
+// Instead, m_exec is copied into a local named _orig_exec and assigned back explicitly by SPMD_SELSE2 / SPMD_SEND_IF2.
+#define SPMD_SIF2(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SELSE2(cond) m_exec = _orig_exec; } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SEND_IF2 m_exec = _orig_exec; }
+
+// Same as SPMD_SIF(), except the if/else blocks are always executed
+// (there is no any() test, so the block runs even when the narrowed mask has no lanes set).
+#define SPMD_SAIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \
+	m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SAELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \
+	m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SAENDIF }
+
+// "Simple" select (switch-like) macros.
+// SPMD_SSELECT(var) opens a do { ... } while(0) scope, copies var into _select_var, saves m_exec in a scoped_exec_restorer named _orig_exec,
+// and starts an empty _select_executed mask that accumulates the lanes handled so far.
+// SPMD_SCASE(value) runs its block on the originally active lanes where _select_var == value, if there are any.
+// SPMD_SCASE_END leaves the select early (via break) once the handled lanes equal the originally active lanes.
+// SPMD_SDEFAULT runs its block on the originally active lanes that no case has handled, if there are any.
+// Cannot use SPMD break, continue, or return inside sselect
+#define SPMD_SSELECT(var)		do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask _select_executed(exec_mask::all_off());
+#define SPMD_SCASE(value)		exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); _select_executed = _select_executed | m_exec;
+
+//#define SPMD_SCASE_END			if (_select_executed.get_movemask() == _orig_exec.m_prev_mask.get_movemask()) break; }
+#define SPMD_SCASE_END			if (!any(_select_executed ^ _orig_exec.m_prev_mask)) break; }
+#define SPMD_SDEFAULT			exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); if (any(_all_other_lanes)) { m_exec = _all_other_lanes;
+#define SPMD_SDEFAULT_END		}
+#define SPMD_SSELECT_END		} while(0);
+
+// Same as SPMD_SSELECT, except all cases are executed.
+// (The case/default blocks have no any() test and SPMD_SACASE_END has no early exit.)
+// Cannot use SPMD break, continue, or return inside sselect
+#define SPMD_SASELECT(var)		do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask _select_executed(exec_mask::all_off());
+
+#define SPMD_SACASE(value)		exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); { m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); \
+	_select_executed = _select_executed | m_exec;
+
+#define SPMD_SACASE_END			}
+#define SPMD_SADEFAULT			exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); { m_exec = _all_other_lanes;
+#define SPMD_SADEFAULT_END		}
+#define SPMD_SASELECT_END		} while(0);
+
+// Scope guard used by the SPMD_IF family.
+// On entry it narrows the kernel's m_exec to the lanes where cond is set and remembers the active lanes that were left out.
+// On exit it ORs those remembered lanes back into whatever m_exec has become, then runs check_masks().
+struct scoped_exec_restorer2
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_unexecuted_lanes;
+
+	CPPSPMD_FORCE_INLINE scoped_exec_restorer2(spmd_kernel *pKernel, const vbool &cond) :
+		m_pKernel(pKernel)
+	{
+		const exec_mask cond_lanes(cond);
+		const exec_mask entry_exec = pKernel->m_exec;
+
+		m_unexecuted_lanes = andnot(cond_lanes, entry_exec);
+		pKernel->m_exec = cond_lanes & entry_exec;
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_exec_restorer2()
+	{
+		// Lanes disabled inside the scope stay off; lanes that never entered it come back.
+		m_pKernel->m_exec = m_pKernel->m_exec | m_unexecuted_lanes;
+		m_pKernel->check_masks();
+	}
+};
+
+// SPMD if/else macros built on scoped_exec_restorer2.
+// SPMD_IF(cond) opens a scope whose scoped_exec_restorer2 narrows m_exec to the lanes where cond is set; the body runs only if any lane remains.
+// SPMD_ELSE(cond) closes that scope and opens another one for !cond; pass it the same condition as the matching SPMD_IF.
+// SPMD_END_IF closes the body and the scope.
+#define SPMD_IF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); if (any(m_exec)) {
+#define SPMD_ELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); if (any(m_exec)) {
+#define SPMD_END_IF } }
+
+// Same as SPMD_IF, except the conditional block is always executed.
+#define SPMD_AIF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); {
+#define SPMD_AELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); {
+#define SPMD_AEND_IF } }
+
+// Scope object that snapshots a kernel's m_exec, m_kernel_exec and m_continue_mask (plus m_in_loop in debug builds) at construction
+// and writes all of them back at destruction. Debug builds also run check_masks() after restoring.
+class scoped_exec_saver
+{
+	exec_mask m_exec, m_kernel_exec, m_continue_mask;
+	spmd_kernel *m_pKernel;
+#ifdef _DEBUG
+	bool m_in_loop;
+#endif
+
+public:
+	inline scoped_exec_saver(spmd_kernel *pKernel) :
+		m_exec(pKernel->m_exec), m_kernel_exec(pKernel->m_kernel_exec), m_continue_mask(pKernel->m_continue_mask),
+		m_pKernel(pKernel)
+	{ 
+#ifdef _DEBUG
+		m_in_loop = pKernel->m_in_loop;
+#endif
+	}
+		
+	inline ~scoped_exec_saver()
+	{ 
+		m_pKernel->m_exec = m_exec; 
+		m_pKernel->m_continue_mask = m_continue_mask; 
+		m_pKernel->m_kernel_exec = m_kernel_exec; 
+#ifdef _DEBUG
+		m_pKernel->m_in_loop = m_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+// SPMD_BEGIN_CALL declares a scoped_exec_saver for the rest of the enclosing scope and clears m_continue_mask.
+// SPMD_BEGIN_CALL_ALL_LANES does the same and additionally sets m_exec to all-on.
+#define SPMD_BEGIN_CALL scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_continue_mask = exec_mask::all_off();
+#define SPMD_BEGIN_CALL_ALL_LANES scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_exec = exec_mask::all_on(); m_continue_mask = exec_mask::all_off();
+
+// Iterates over the integer range between begin and end, PROGRAM_COUNT values per iteration.
+// foreachBody(loop_index, n) is called once per iteration: loop_index starts at begin + program_index and advances by PROGRAM_COUNT,
+// and n is PROGRAM_COUNT, or the remainder on a final partial iteration.
+// Returns immediately when the range is empty or no lane is active. On exit m_exec is the entry mask ANDed with m_kernel_exec,
+// and m_continue_mask is restored to its entry value.
+template<typename ForeachBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_foreach(int begin, int end, const ForeachBody& foreachBody)
+{
+	if (begin == end)
+		return;
+	
+	if (!any(m_exec))
+		return;
+
+	// We don't support iterating backwards.
+	if (begin > end)
+		std::swap(begin, end);
+
+	exec_mask prev_continue_mask = m_continue_mask, prev_exec = m_exec;
+	
+	// Number of iterations that use every lane, and the size of the leftover partial iteration (0 if none).
+	int total_full = (end - begin) / PROGRAM_COUNT;
+	int total_partial = (end - begin) % PROGRAM_COUNT;
+
+	lint_t loop_index = begin + program_index;
+	
+	const int total_loops = total_full + (total_partial ? 1 : 0);
+
+	m_continue_mask = exec_mask::all_off();
+
+	for (int i = 0; i < total_loops; i++)
+	{
+		int n = PROGRAM_COUNT;
+		if ((i == (total_loops - 1)) && (total_partial))
+		{
+			// Final partial iteration: only keep lanes whose program_index is below total_partial.
+			exec_mask partial_mask = exec_mask(vint_t(total_partial) > vint_t(program_index));
+			m_exec = m_exec & partial_mask;
+			n = total_partial;
+		}
+
+		foreachBody(loop_index, n);
+
+		// Re-enable lanes that called spmd_continue(); stop early if nothing is left running.
+		m_exec = m_exec | m_continue_mask;
+		if (!any(m_exec))
+			break;
+
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+				
+		store_all(loop_index, loop_index + PROGRAM_COUNT);
+	}
+
+	// ANDing with m_kernel_exec keeps lanes removed by spmd_return() off.
+	m_exec = prev_exec & m_kernel_exec;
+	m_continue_mask = prev_continue_mask;
+	check_masks();
+}
+
+// SPMD while loop. Each pass ANDs m_exec with the mask produced by whileCondBody(), stops when no lane is left,
+// otherwise runs whileBody() and re-enables the lanes that called spmd_continue().
+// On exit m_exec is the entry mask ANDed with m_kernel_exec (so lanes removed by spmd_return() stay off) and
+// m_continue_mask is restored to its entry value.
+template<typename WhileCondBody, typename WhileBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody)
+{
+	const exec_mask entry_exec = m_exec;
+	const exec_mask outer_continue_mask = m_continue_mask;
+
+	m_continue_mask = exec_mask::all_off();
+
+#if defined(_DEBUG)
+	const bool outer_in_loop = m_in_loop;
+	m_in_loop = true;
+#endif
+
+	for (;;)
+	{
+		const exec_mask cond_lanes = exec_mask(whileCondBody());
+		m_exec = m_exec & cond_lanes;
+
+		if (!any(m_exec))
+			break;
+
+		whileBody();
+
+		// Lanes that hit spmd_continue() rejoin for the next pass.
+		m_exec = m_exec | m_continue_mask;
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+	}
+
+#if defined(_DEBUG)
+	m_in_loop = outer_in_loop;
+#endif
+
+	m_exec = entry_exec & m_kernel_exec;
+	m_continue_mask = outer_continue_mask;
+	check_masks();
+}
+
+// Scope object used by the macro-based loops (SPMD_WHILE, SPMD_FOR, SPMD_FOREACH, ...).
+// Constructor: saves the kernel's m_exec and m_continue_mask, clears m_continue_mask for the new loop, and (debug builds) marks the kernel as inside a loop.
+// Destructor: sets m_exec to the saved mask ANDed with m_kernel_exec, restores m_continue_mask, and (debug builds) restores m_in_loop and runs check_masks().
+struct scoped_while_restorer
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_orig_exec, m_orig_continue_mask;
+#ifdef _DEBUG
+	bool m_prev_in_loop;
+#endif
+				
+	CPPSPMD_FORCE_INLINE scoped_while_restorer(spmd_kernel *pKernel) : 
+		m_pKernel(pKernel), 
+		m_orig_exec(pKernel->m_exec),
+		m_orig_continue_mask(pKernel->m_continue_mask)
+	{
+		// exec_mask::all_off() is a static factory, so it must be assigned; calling it through the member
+		// (as the original code did) discards the result and leaves an enclosing loop's continue lanes set.
+		pKernel->m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+		m_prev_in_loop = pKernel->m_in_loop;
+		pKernel->m_in_loop = true;
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_while_restorer() 
+	{ 
+		// ANDing with m_kernel_exec keeps lanes removed by spmd_return() off.
+		m_pKernel->m_exec = m_orig_exec & m_pKernel->m_kernel_exec;
+		m_pKernel->m_continue_mask = m_orig_continue_mask;
+#ifdef _DEBUG
+		m_pKernel->m_in_loop = m_prev_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+// Macro form of the SPMD while loop. SPMD_WHILE(cond) opens a scope holding a scoped_while_restorer, then a while(true) loop that ANDs m_exec
+// with cond and breaks when no lane is left. SPMD_WEND re-enables the lanes in m_continue_mask, clears that mask, and closes the loop and the scope.
+#undef SPMD_WHILE
+#undef SPMD_WEND
+#define SPMD_WHILE(cond) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); \
+	m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+
+#define SPMD_WEND m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); } }
+
+// Macro form of foreach over the range between bi and ei: loop_var starts at program_index + b and is advanced by PROGRAM_COUNT in SPMD_FOREACH_END.
+// The last iteration is masked down to the leftover lanes when the range is not a multiple of PROGRAM_COUNT.
+// Note the macro declares fixed local names (b, e, total_full, total_partial, total_loops) inside its scope.
+// Nesting is not supported (although it will compile, but the results won't make much sense).
+#define SPMD_FOREACH(loop_var, bi, ei) if (((bi) != (ei)) && (any(m_exec))) { \
+	scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	uint32_t b = (uint32_t)(bi), e = (uint32_t)(ei); if ((b) > (e)) { std::swap(b, e); } const uint32_t total_full = ((e) - (b)) >> PROGRAM_COUNT_SHIFT, total_partial = ((e) - (b)) & (PROGRAM_COUNT - 1); \
+	lint_t loop_var = program_index + (int)b; const uint32_t total_loops = total_full + (total_partial ? 1U : 0U); \
+	for (uint32_t CPPSPMD_GLUER2(_foreach_counter, __LINE__) = 0; CPPSPMD_GLUER2(_foreach_counter, __LINE__) < total_loops; ++CPPSPMD_GLUER2(_foreach_counter, __LINE__)) { \
+		if ((CPPSPMD_GLUER2(_foreach_counter, __LINE__) == (total_loops - 1)) && (total_partial)) { exec_mask partial_mask = exec_mask(vint_t((int)total_partial) > vint_t(program_index)); m_exec = m_exec & partial_mask; }
+
+#define SPMD_FOREACH_END(loop_var) m_exec = m_exec | m_continue_mask; if (!any(m_exec)) break; m_continue_mask = exec_mask::all_off(); check_masks(); store_all(loop_var, loop_var + PROGRAM_COUNT); } }
+
+// Runs the body once per lane that is set in m_exec's movemask at entry, with index_var set to that lane's index.
+// Before each body m_exec is set via enable_lane(_i) and then ANDed with m_kernel_exec.
+// Okay to use spmd_continue or spmd_return, but not spmd_break
+#define SPMD_FOREACH_ACTIVE(index_var) int64_t index_var; { uint64_t _movemask = m_exec.get_movemask(); if (_movemask) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	for (uint32_t _i = 0; _i < PROGRAM_COUNT; ++_i) { \
+		if (_movemask & (1U << _i)) { \
+			m_exec.enable_lane(_i); m_exec = m_exec & m_kernel_exec; \
+			(index_var) = _i; \
+
+#define SPMD_FOREACH_ACTIVE_END } } } }
+
+// Runs the body once per distinct value found in var (all lanes are stored, sorted and de-duplicated), with index_var set to that value
+// and m_exec set to the lanes where var equals it. Note that m_exec is assigned directly from the comparison, not ANDed with the previous mask.
+// Okay to use spmd_continue, but not spmd_break.
+// NOTE(review): the original comment read "but not spmd_break/spmd_continue", which contradicts itself; the second name was presumably meant to be spmd_return - confirm.
+#define SPMD_FOREACH_UNIQUE_INT(index_var, var) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	CPPSPMD_DECL(int_t, _vals[PROGRAM_COUNT]); store_linear_all(_vals, var); std::sort(_vals, _vals + PROGRAM_COUNT); \
+	const int _n = (int)(std::unique(_vals, _vals + PROGRAM_COUNT) - _vals); \
+	for (int _i = 0; _i < _n; ++_i) { int index_var = _vals[_i]; vbool cond = (vint_t(var) == vint_t(index_var)); m_exec = exec_mask(cond);
+
+#define SPMD_FOREACH_UNIQUE_INT_END } }
+
+// Scope guard for the "simple" loop macros: records the kernel's m_exec on entry and puts it back unchanged on exit.
+// Debug builds additionally mark the kernel as inside a loop for the guard's lifetime, and restore that flag and run check_masks() on exit.
+struct scoped_simple_while_restorer
+{
+	spmd_kernel* m_pKernel;
+	exec_mask m_orig_exec;
+#if defined(_DEBUG)
+	bool m_prev_in_loop;
+#endif
+
+	CPPSPMD_FORCE_INLINE scoped_simple_while_restorer(spmd_kernel* pKernel) :
+		m_pKernel(pKernel),
+		m_orig_exec(pKernel->m_exec)
+	{
+#if defined(_DEBUG)
+		m_prev_in_loop = m_pKernel->m_in_loop;
+		m_pKernel->m_in_loop = true;
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_simple_while_restorer()
+	{
+		m_pKernel->m_exec = m_orig_exec;
+#if defined(_DEBUG)
+		m_pKernel->m_in_loop = m_prev_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+// "Simple" while loop: a scoped_simple_while_restorer saves m_exec, then each pass ANDs m_exec with cond and breaks when no lane is left.
+// There is no continue-mask handling here.
+// Cannot use SPMD break, continue, or return inside simple while
+
+#define SPMD_SWHILE(cond) { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	while(true) { \
+		exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+#define SPMD_SWEND } }	
+
+// "Simple" do/while loop: same as above but the condition is applied at the end of the body (in SPMD_SEND_DO), so the body runs at least once.
+// Cannot use SPMD break, continue, or return inside simple do
+#define SPMD_SDO { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) {
+#define SPMD_SEND_DO(cond) exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break; } }	
+
+// Macro form of the SPMD for loop. SPMD_FOR runs for_init, creates a scoped_while_restorer, then loops: AND m_exec with for_cond and break when no lane is left.
+// SPMD_END_FOR re-enables the lanes in m_continue_mask, clears that mask, runs for_inc, and closes the loop and the scope.
+#undef SPMD_FOR
+#undef SPMD_END_FOR
+#define SPMD_FOR(for_init, for_cond) { for_init; scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(for_cond)); \
+	m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+#define SPMD_END_FOR(for_inc) m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); for_inc; } }
+		
+// SPMD for loop. forInitBody() runs once up front. Each pass then ANDs m_exec with the mask from forCondBody(), stops when no
+// lane is left, otherwise runs forBody(), re-enables the lanes that called spmd_continue(), and runs forIncrBody().
+// On exit m_exec is the entry mask (captured before forInitBody) ANDed with m_kernel_exec, and m_continue_mask is restored.
+template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody)
+{
+	const exec_mask entry_exec = m_exec;
+
+	forInitBody();
+
+	const exec_mask outer_continue_mask = m_continue_mask;
+	m_continue_mask = exec_mask::all_off();
+
+#if defined(_DEBUG)
+	const bool outer_in_loop = m_in_loop;
+	m_in_loop = true;
+#endif
+
+	for (;;)
+	{
+		const exec_mask cond_lanes = exec_mask(forCondBody());
+		m_exec = m_exec & cond_lanes;
+
+		if (!any(m_exec))
+			break;
+
+		forBody();
+
+		// Lanes that hit spmd_continue() rejoin before the increment step.
+		m_exec = m_exec | m_continue_mask;
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+
+		forIncrBody();
+	}
+
+	m_continue_mask = outer_continue_mask;
+	m_exec = entry_exec & m_kernel_exec;
+
+#if defined(_DEBUG)
+	m_in_loop = outer_in_loop;
+	check_masks();
+#endif
+}
--- a/engine/thirdparty/basis_universal/encoder/cppspmd_math.h
+++ b/engine/thirdparty/basis_universal/encoder/cppspmd_math.h
@ -0,0 +1,725 @@
+// Do not include this header directly.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The general goal of these vectorized estimated math functions is scalability/performance.
+// There are explicitly no checks for NaN's/Inf's on the input arguments. There are no assertions either. 
+// These are fast estimate functions - if you need more than that, use stdlib. Please do a proper 
+// engineering analysis before relying on them.
+// I have chosen functions written by others, ported them to CppSPMD, then measured their abs/rel errors.
+// I compared each to the ones in DirectXMath and stdlib's for accuracy/performance.
+
+// fmod-style remainder of a with respect to b, given b_inv (the caller-supplied reciprocal of b) so no division is needed.
+// The magnitude is frac(|a * b_inv|) * |b|; the result is negated in lanes where a < 0.
+CPPSPMD_FORCE_INLINE vfloat fmod_inv(const vfloat& a, const vfloat& b, const vfloat& b_inv) 
+{
+	vfloat quotient_mag = abs(a * b_inv);
+	vfloat remainder_mag = frac(quotient_mag) * abs(b);
+
+	return spmd_ternaryf(a < 0, -remainder_mag, remainder_mag);
+}
+
+// Variant of fmod_inv() without the abs()/sign handling: simply frac(a * b_inv) * b.
+// NOTE(review): the "_p" suffix presumably means "positive inputs only" - confirm with callers.
+CPPSPMD_FORCE_INLINE vfloat fmod_inv_p(const vfloat& a, const vfloat& b, const vfloat& b_inv) 
+{
+	vfloat quotient_frac = frac(a * b_inv);
+	return quotient_frac * b;
+}
+
+// Divides a by b while avoiding division by zero or very small values: in lanes where |b| is not greater than
+// fDivThresh, the divisor is replaced by -fDivThresh (when b < 0) or +fDivThresh (otherwise).
+CPPSPMD_FORCE_INLINE vfloat safe_div(vfloat a, vfloat b, float fDivThresh = 1e-7f)
+{
+	vfloat signed_thresh = spmd_ternaryf(b < 0.0f, -fDivThresh, fDivThresh);
+	vfloat divisor = spmd_ternaryf(abs(b) > fDivThresh, b, signed_thresh);
+
+	return a / divisor;
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	f range: 0.0000000000001250 10000000000.0000000000000000, vals: 1073741824
+
+	log2_est():
+	max abs err: 0.0000023076808731
+	max rel err: 0.0000000756678881
+	avg abs err: 0.0000007535452724
+	avg rel err: 0.0000000235117843
+
+	XMVectorLog2():
+	max abs err: 0.0000023329709933
+	max rel err: 0.0000000826961046
+	avg abs err: 0.0000007564889684
+	avg rel err: 0.0000000236051899
+
+	std::log2f():
+	max abs err: 0.0000020265979401
+	max rel err: 0.0000000626647654
+	avg abs err: 0.0000007494445227
+	avg rel err: 0.0000000233800985
+*/
+
+// Estimates log2(v) per lane.
+// The input is clamped to at least 2.2e-38, its float bits are split into exponent and significand, the significand is
+// brought into a range around 1 (halving it when it is >= 1.5), and a rational polynomial in (significand - 1) is added to the exponent.
+// No NaN/Inf or negative-input checks are performed beyond the clamp.
+// See https://tech.ebayinc.com/engineering/fast-approximate-logarithms-part-iii-the-formulas/
+inline vfloat spmd_kernel::log2_est(vfloat v)
+{
+	vfloat signif, fexp;
+
+	// Just clamp to a very small value, instead of checking for invalid inputs.
+	vfloat x = max(v, 2.2e-38f);
+
+	/*
+	 * Assume IEEE representation, which is sgn(1):exp(8):frac(23)
+	 * representing (1+frac)*2^(exp-127).  Call 1+frac the significand
+	 */
+
+	 // get exponent
+	vint ux1_i = cast_vfloat_to_vint(x);
+
+	vint exp = VUINT_SHIFT_RIGHT(ux1_i & 0x7F800000, 23);
+
+	// actual exponent is exp-127, will subtract 127 later
+
+	vint ux2_i;
+	vfloat ux2_f;
+
+	vint greater = ux1_i & 0x00400000;  // nonzero if the top fraction bit is set, i.e. signif >= 1.5
+	SPMD_SIF(greater != 0)
+	{
+		// signif >= 1.5 so need to divide by 2.  Accomplish this by stuffing exp = 126 which corresponds to an exponent of -1 
+		// These are unmasked store_all() writes; lanes with greater == 0 are overwritten by the masked stores in the else branch below.
+		store_all(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f000000);
+
+		store_all(ux2_f, cast_vint_to_vfloat(ux2_i));
+
+		// 126 instead of 127 compensates for division by 2
+		store_all(fexp, vfloat(exp - 126));    
+	}
+	SPMD_SELSE(greater != 0)
+	{
+		// get signif by stuffing exp = 127 which corresponds to an exponent of 0
+		store(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f800000);
+
+		store(ux2_f, cast_vint_to_vfloat(ux2_i));
+
+		store(fexp, vfloat(exp - 127));
+	}
+	SPMD_SENDIF
+
+	store_all(signif, ux2_f);
+	store_all(signif, signif - 1.0f);
+
+	// Coefficients of the rational approximation evaluated on (signif - 1).
+	const float a = 0.1501692f, b = 3.4226132f, c = 5.0225057f, d = 4.1130283f, e = 3.4813372f;
+
+	vfloat xm1 = signif;
+	vfloat xm1sqr = xm1 * xm1;
+		
+	return fexp + ((a * (xm1sqr * xm1) + b * xm1sqr + c * xm1) / (xm1sqr + d * xm1 + e));
+	
+	// fma lowers accuracy for SSE4.1 - no idea why (compiler reordering?)
+	//return fexp + ((vfma(a, (xm1sqr * xm1), vfma(b, xm1sqr, c * xm1))) / (xm1sqr + vfma(d, xm1, e)));
+}
+
+// Natural-log estimate: log2_est(v) scaled by ln(2).
+// Built on log2_est(), so its precision can be no better than that function's.
+inline vfloat spmd_kernel::log_est(vfloat v)
+{
+	const float ln_of_2 = 0.693147181f;
+
+	return log2_est(v) * ln_of_2;
+}
+
+// Range-reduction helper for exp2_est().
+// In/out arg: on return holds the fractional part of the input, reduced by 0.5 in lanes where that fraction exceeded 0.5.
+// Out two_int_a: 2 raised to the integer part of the input, built by placing (int part + 127, capped at 254) into the float exponent field.
+// Out adjustment: 1 in lanes where 0.5 was subtracted, otherwise 0.
+// NOTE(review): the exponent construction assumes a non-negative input; exp2_est() negates negative inputs before calling - confirm no other callers.
+CPPSPMD_FORCE_INLINE void spmd_kernel::reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment)
+{
+	// Assume we're using equation (2)
+	store_all(adjustment, 0);
+	
+	// integer part of the input argument
+	vint int_arg = (vint)arg;
+	
+	// if frac(arg) is greater than 0.5...
+	SPMD_SIF((arg - int_arg) > 0.5f)   
+	{
+		store(adjustment, 1);
+		
+		// then change it to [0.0, 0.5]
+		store(arg, arg - 0.5f);
+	}
+	SPMD_SENDIF
+
+	// arg == just the fractional part
+	store_all(arg, arg - (vfloat)int_arg);
+   
+	// Now compute 2** (int) arg. 
+	store_all(int_arg, min(int_arg + 127, 254));
+	
+	store_all(two_int_a, cast_vint_to_vfloat(VINT_SHIFT_LEFT(int_arg, 23)));
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	f range : -50.0000000000000000 49.9999940395355225, vals : 16777216
+	
+	exp2_est():
+	Total passed near - zero check : 16777216
+	Total sign diffs : 0
+	max abs err: 1668910609.7500000000000000
+	max rel err: 0.0000015642030031
+	avg abs err: 10793794.4007573910057545
+	avg rel err: 0.0000003890893282
+	 
+	XMVectorExp2():
+	Total passed near-zero check: 16777216
+	Total sign diffs: 0
+	max abs err: 1665552836.8750000000000000
+	max rel err: 0.0000114674862370
+	avg abs err: 10771868.2627860084176064
+	avg rel err: 0.0000011218880770
+
+	std::exp2f():
+	Total passed near-zero check: 16777216
+	Total sign diffs: 0
+	max abs err: 1591636585.6250000000000000
+	max rel err: 0.0000014849731018
+	avg abs err: 10775800.3204844966530800
+	avg rel err: 0.0000003851496422
+*/
+
+// Estimates 2**arg per lane.
+// Lanes where |arg| > 126 return 0. Negative inputs are negated and the reciprocal is taken at the end.
+// The remaining work is range reduction via reduce_expb() followed by a rational polynomial approximation.
+// http://www.ganssle.com/item/approximations-c-code-exponentiation-log.htm
+inline vfloat spmd_kernel::exp2_est(vfloat arg)
+{
+	SPMD_BEGIN_CALL
+
+	const vfloat P00 = +7.2152891521493f;
+	const vfloat P01 = +0.0576900723731f;
+	const vfloat Q00 = +20.8189237930062f;
+	const vfloat Q01 = +1.0f;
+	const vfloat sqrt2 = 1.4142135623730950488f; // sqrt(2) for scaling 
+
+	vfloat result = 0.0f;
+
+	// Return 0 if the magnitude of arg is too large. 
+	// We're not introducing inf/nan's into calculations, or risk doing so by returning huge default values.
+	// spmd_return() disables these lanes, so the masked store into result at the end leaves them at 0.
+	SPMD_IF(abs(arg) > 126.0f)
+	{
+		spmd_return();
+	}
+	SPMD_END_IF
+
+	// 2**(int(a))
+	vfloat two_int_a;                
+	
+	// set to 1 by reduce_expb
+	vint adjustment;
+	
+	// 0 if arg is +; 1 if negative
+	vint negative = 0;                 
+
+	// If the input is negative, invert it. At the end we'll take the reciprocal, since n**(-x) = 1/(n**x).
+	SPMD_SIF(arg < 0.0f)
+	{
+		store(arg, -arg);
+		store(negative, 1);
+	}
+	SPMD_SENDIF
+
+	store_all(arg, min(arg, 126.0f));
+
+	// reduce to [0.0, 0.5]
+	reduce_expb(arg, two_int_a, adjustment);
+
+	// The format of the polynomial is:
+	//  answer=(Q(x**2) + x*P(x**2))/(Q(x**2) - x*P(x**2))
+	//
+	//  The following computes the polynomial in several steps:
+
+	// Q(x**2)
+	vfloat Q = vfma(Q01, (arg * arg), Q00);
+	
+	// x*P(x**2)
+	vfloat x_P = arg * (vfma(P01, arg * arg, P00));
+	
+	vfloat answer = (Q + x_P) / (Q - x_P);
+
+	// Now correct for the scaling factor of 2**(int(a))
+	store_all(answer, answer * two_int_a);
+			
+	// If the result had a fractional part > 0.5, correct for that
+	store_all(answer, spmd_ternaryf(adjustment != 0, answer * sqrt2, answer));
+
+	// Correct for a negative input
+	SPMD_SIF(negative != 0)
+	{
+		store(answer, 1.0f / answer);
+	}
+	SPMD_SENDIF
+
+	store(result, answer);
+
+	return result;
+}
+
+// Estimates e**arg per lane by rescaling the argument and delegating to exp2_est().
+inline vfloat spmd_kernel::exp_est(vfloat arg)
+{
+	// e^x = exp2(x / log_base_e(2))
+	// constant is 1.0/(log(2)/log(e)) or 1/log(2)
+	const float inv_ln_of_2 = 1.44269504f;
+
+	return exp2_est(arg * inv_ln_of_2);
+}
+
+// Estimates arg1**arg2 per lane as exp_est(log_est(arg1) * arg2).
+inline vfloat spmd_kernel::pow_est(vfloat arg1, vfloat arg2)
+{
+	vfloat scaled_log = log_est(arg1) * arg2;
+
+	return exp_est(scaled_log);
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	Total near-zero: 144, output above near-zero tresh: 30
+	Total near-zero avg: 0.0000067941016621 max: 0.0000134706497192
+	Total near-zero sign diffs: 5
+	Total passed near-zero check: 16777072
+	Total sign diffs: 5
+	max abs err: 0.0000031375306036
+	max rel err: 0.1140846017075028
+	avg abs err: 0.0000003026226621
+	avg rel err: 0.0000033564977623
+*/
+
+// Math from this web page: http://developer.download.nvidia.com/cg/sin.html
+// This is ~2x slower than sin_est() or cos_est(), and less accurate, but I'm keeping it here for comparison purposes to help validate/sanity check sin_est() and cos_est().
+// a: input angle per lane. NOTE(review): presumably radians, since it is scaled by c1_w (0.159154943091, about 1/(2*pi)) - confirm.
+// sin_flag: when true, c1_x (0.25) is subtracted from the scaled angle; when false it is not. Per the function name this presumably selects sine vs. cosine.
+inline vfloat spmd_kernel::sincos_est_a(vfloat a, bool sin_flag)
+{
+	// Scalar constants mirroring the c0..c4 constant vectors of the referenced Cg code.
+	const float c0_x = 0.0f, c0_y = 0.5f, c0_z = 1.0f;
+	const float c1_x = 0.25f, c1_y = -9.0f, c1_z = 0.75f, c1_w = 0.159154943091f;
+	const float c2_x = 24.9808039603f, c2_y = -24.9808039603f, c2_z = -60.1458091736f, c2_w = 60.1458091736f;
+	const float c3_x = 85.4537887573f, c3_y = -85.4537887573f, c3_z = -64.9393539429f, c3_w = 64.9393539429f;
+	const float c4_x = 19.7392082214f, c4_y = -19.7392082214f, c4_z = -1.0f, c4_w = 1.0f;
+
+	vfloat r0_x, r0_y, r0_z, r1_x, r1_y, r1_z, r2_x, r2_y, r2_z;
+
+	// Scale the angle, then keep only the fractional part.
+	store_all(r1_x, sin_flag ? vfms(c1_w, a, c1_x) : c1_w * a);
+
+	store_all(r1_y, frac(r1_x));                   
+	
+	// r2_* are per-lane selectors produced by converting the range comparisons below to floats.
+	store_all(r2_x, (vfloat)(r1_y < c1_x));        
+
+	store_all(r2_y, (vfloat)(r1_y >= c1_y));    
+	store_all(r2_z, (vfloat)(r1_y >= c1_z));    
+
+	store_all(r2_y, vfma(r2_x, c4_z, vfma(r2_y, c4_w, r2_z * c4_z)));
+
+	// Squared distances of the fraction from 0, 0.5 and 1.
+	store_all(r0_x, c0_x - r1_y);                
+	store_all(r0_y, c0_y - r1_y);                
+	store_all(r0_z, c0_z - r1_y);                
+	
+	store_all(r0_x, r0_x * r0_x);
+	store_all(r0_y, r0_y * r0_y);
+	store_all(r0_z, r0_z * r0_z);
+
+	// Evaluate three polynomials (one per r0 component) step by step with fused multiply-adds.
+	store_all(r1_x, vfma(c2_x, r0_x, c2_z));           
+	store_all(r1_y, vfma(c2_y, r0_y, c2_w));           
+	store_all(r1_z, vfma(c2_x, r0_z, c2_z));           
+	
+	store_all(r1_x, vfma(r1_x, r0_x, c3_x));
+	store_all(r1_y, vfma(r1_y, r0_y, c3_y));
+	store_all(r1_z, vfma(r1_z, r0_z, c3_x));
+		
+	store_all(r1_x, vfma(r1_x, r0_x, c3_z));
+	store_all(r1_y, vfma(r1_y, r0_y, c3_w));
+	store_all(r1_z, vfma(r1_z, r0_z, c3_z));
+	
+	store_all(r1_x, vfma(r1_x, r0_x, c4_x));
+	store_all(r1_y, vfma(r1_y, r0_y, c4_y));
+	store_all(r1_z, vfma(r1_z, r0_z, c4_x));
+
+	store_all(r1_x, vfma(r1_x, r0_x, c4_z));
+	store_all(r1_y, vfma(r1_y, r0_y, c4_w));
+	store_all(r1_z, vfma(r1_z, r0_z, c4_z));
+
+	// Combine the three polynomial results using the r2_* selectors.
+	store_all(r0_x, vfnma(r1_x, r2_x, vfnma(r1_y, r2_y, r1_z * -r2_z)));
+
+	return r0_x;
+}
+
+// Reciprocal estimate: an integer "magic constant" initial guess followed by one refinement step.
+// Lanes where q is below the minimum threshold use the magic constant (reinterpreted as a float) for the initial guess instead of q.
+// positive values only
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1(const vfloat& q)
+{
+	//const int mag = 0x7EF312AC; // 2 NR iters, 3 is  0x7EEEEBB3
+	const int magic = 0x7EF311C3;
+	const float min_thresh = .0000125f;
+
+	vfloat guarded_q = spmd_ternaryf(q >= min_thresh, q, cast_vint_to_vfloat(vint(magic)));
+
+	// Initial guess: subtract the float's bit pattern from the magic constant.
+	vint seed_bits = vint(magic) - cast_vfloat_to_vint(guarded_q);
+	vfloat seed = cast_vint_to_vfloat(seed_bits);
+
+	// One refinement step: seed * (2 - seed * q).
+	return seed * vfnma(seed, q, 2.0f);
+}
+
+// Signed variant of recip_est1(): the estimate is computed on abs(t) and multiplied by sign(t) at the end.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1_pn(const vfloat& t)
+{
+	//const int mag = 0x7EF312AC; // 2 NR iters, 3 is  0x7EEEEBB3
+	const int magic = 0x7EF311C3;
+	const float min_thresh = .0000125f;
+
+	vfloat t_sign = sign(t);
+	vfloat t_mag = abs(t);
+
+	vfloat guarded_mag = spmd_ternaryf(t_mag >= min_thresh, t_mag, cast_vint_to_vfloat(vint(magic)));
+
+	// Initial guess: subtract the float's bit pattern from the magic constant.
+	vint seed_bits = vint(magic) - cast_vfloat_to_vint(guarded_mag);
+	vfloat seed = cast_vint_to_vfloat(seed_bits);
+
+	// One refinement step on the magnitude, then put the sign back.
+	return seed * vfnma(seed, t_mag, 2.0f) * t_sign;
+}
+
+// Reciprocal square root estimate: bit-level initial guess (magic constant minus the float's bits shifted right by one)
+// followed by a single refinement step.
+// https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
+// https://github.com/hcs0/Hackers-Delight/blob/master/rsqrt.c.txt
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est1(vfloat x0)
+{
+	vfloat half_x0 = 0.5f * x0;
+
+	vint seed_bits = vint(0x5F375A82) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1));
+	vfloat y = cast_vint_to_vfloat(seed_bits);
+
+	return y * vfnma(half_x0 * y, y, 1.5008909f);
+}
+
+// Reciprocal square root estimate with two refinement steps.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est2(vfloat x0)
+{
+	// Initial guess: magic constant minus the input's float bits shifted right by one.
+	vint shifted_bits = VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1);
+	vfloat guess = cast_vint_to_vfloat(vint(0x5F37599E) - shifted_bits);
+
+	vfloat half_x0 = 0.5f * x0;
+
+	// Two identical refinement steps, each feeding the next.
+	vfloat first_pass = guess * vfnma(half_x0 * guess, guess, 1.5);
+	vfloat second_pass = first_pass * vfnma(half_x0 * first_pass, first_pass, 1.5);
+	return second_pass;
+}
+
+// Polynomial atan2 estimate.
+// Math from: http://developer.download.nvidia.com/cg/atan2.html
+// TODO: Needs more validation, parameter checking.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::atan2_est(vfloat y, vfloat x)
+{
+	vfloat abs_y = abs(y);
+	vfloat abs_x = abs(x);
+
+	// Divide the smaller magnitude by the larger one.
+	vfloat larger = max(abs_x, abs_y);
+	vfloat smaller = min(abs_x, abs_y);
+	vfloat ratio = smaller / larger;
+
+	// Polynomial in ratio^2, evaluated one fused step at a time.
+	vfloat ratio_sq = ratio * ratio;
+	vfloat poly = vfma(-0.013480470f, ratio_sq, 0.057477314f);
+	store_all(poly, vfms(poly, ratio_sq, 0.121239071f));
+	store_all(poly, vfma(poly, ratio_sq, 0.195635925f));
+	store_all(poly, vfms(poly, ratio_sq, 0.332994597f));
+	store_all(poly, vfma(poly, ratio_sq, 0.999995630f));
+
+	vfloat angle = poly * ratio;
+
+	// |y| > |x|: reflect about pi/2.
+	store_all(angle, spmd_ternaryf(abs_y > abs_x, vfloat(1.570796327f) - angle, angle));
+
+	// Negative x: reflect about pi. Negative y: negate the result.
+	store_all(angle, spmd_ternaryf(x < 0.0f, vfloat(3.141592654f) - angle, angle));
+	store_all(angle, spmd_ternaryf(y < 0.0f, -angle, angle));
+
+	return angle;
+}
+
+/*
+    clang 9.0.0 for win /fp:precise release
+	Tested range: -25.1327412287183449 25.1327382326621169, vals : 16777216
+	Skipped angles near 90/270 within +- .001 radians.
+	Near-zero threshold: .0000125f
+	Near-zero output above check threshold: 1e-6f
+
+	Total near-zero: 144, output above near-zero thresh: 20
+	Total near-zero avg: 0.0000067510751968 max: 0.0000133514404297
+	Total near-zero sign diffs: 5
+	Total passed near-zero check: 16766400
+	Total sign diffs: 5
+	max abs err: 1.4982600811139264
+	max rel err: 0.1459155900188041
+	avg rel err: 0.0000054659502568
+
+	XMVectorTan() precise:
+	Total near-zero: 144, output above near-zero thresh: 18
+	Total near-zero avg: 0.0000067641216186 max: 0.0000133524126795
+	Total near-zero sign diffs: 0
+	Total passed near-zero check: 16766400
+	Total sign diffs: 0
+	max abs err: 1.9883573246424930
+	max rel err: 0.1459724171926864
+	avg rel err: 0.0000054965766843
+
+	std::tanf():
+	Total near-zero: 144, output above near-zero thresh: 0
+	Total near-zero avg: 0.0000067116930779 max: 0.0000127713074107
+	Total near-zero sign diffs: 11
+	Total passed near-zero check: 16766400
+	Total sign diffs: 11
+	max abs err: 0.8989131818294709
+	max rel err: 0.0573181403173166
+	avg rel err: 0.0000030791301203
+	
+	Originally from:
+	http://www.ganssle.com/approx.htm
+*/
+
+// Rational tangent approximation: x * (c1 + c2*x^2) / (c3 + x^2 * (c4 + x^2)).
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::tan82(vfloat x)
+{
+	// Original double version was 8.2 digits
+	//double c1 = 211.849369664121f, c2 = -12.5288887278448f, c3 = 269.7350131214121f, c4 = -71.4145309347748f;
+	// Tuned float constants for lower avg rel error (without using FMA3):
+	const float c1 = 211.849350f, c2 = -12.5288887f, c3 = 269.734985f, c4 = -71.4145203f;
+
+	vfloat x_sq = x * x;
+
+	vfloat numerator = x * vfma(c2, x_sq, c1);
+	vfloat denominator = vfma(x_sq, (c4 + x_sq), c3);
+
+	return numerator / denominator;
+}
+
+// Tangent estimate built on tan82().
+// Don't call this for angles close to 90/270!
+// The sign of the input is stripped up front and XORed back onto the result at the end.
+inline vfloat spmd_kernel::tan_est(vfloat x)
+{
+	const float fPi = 3.141592653589793f, fOneOverPi = 0.3183098861837907f;
+	// Per-octant bias values {0, 2, -2, 4}, stored with a +128 offset so they fit in unsigned bytes.
+	CPPSPMD_DECL(const uint8_t, s_table0[16]) =	{ 128 + 0, 128 + 2, 128 + -2, 128 + 4,    128 + 0, 128 + 2, 128 + -2, 128 + 4,	  128 + 0, 128 + 2, 128 + -2, 128 + 4,   128 + 0, 128 + 2, 128 + -2, 128 + 4 };
+
+	vint table = init_lookup4(s_table0); // a load
+	// Remember the input's sign bit; everything below works on abs(x).
+	vint sgn = cast_vfloat_to_vint(x) & 0x80000000;
+
+	store_all(x, abs(x));
+	vfloat orig_x = x;
+
+	// x becomes the fractional part of |x| / pi.
+	vfloat q = x * fOneOverPi;
+	store_all(x, q - floor(q));
+
+	// octant is the integer conversion of 4 * x.
+	vfloat x4 = x * 4.0f;
+	vint octant = (vint)(x4);
+
+	// Odd octants use the negated argument.
+	vfloat x0 = spmd_ternaryf((octant & 1) != 0, -x4, x4);
+
+	// Presumably selects the table byte indexed by octant - TODO confirm table_lookup4_8 semantics.
+	vint k = table_lookup4_8(octant, table) & 0xFF; // a shuffle
+
+	// Remove the +128 storage offset to recover the signed bias.
+	vfloat bias = (vfloat)k + -128.0f;
+	vfloat y = x0 + bias;
+
+	vfloat z = tan82(y);
+
+	vfloat r;
+	
+	vbool octant_one_or_two = (octant == 1) || (octant == 2);
+
+	// SPMD optimization - skip costly divide if we can
+	if (spmd_any(octant_one_or_two))
+	{
+		// Keep |z| at least fDivThresh (preserving its sign) before taking the reciprocal.
+		const float fDivThresh = .4371e-7f;
+		vfloat one_over_z = 1.0f / spmd_ternaryf(abs(z) > fDivThresh, z, spmd_ternaryf(z < 0.0f, -fDivThresh, fDivThresh));
+				
+		// Octants 1 and 2 use 1/z, the others use z; octants with bit 1 set are negated.
+		vfloat b = spmd_ternaryf(octant_one_or_two, one_over_z, z);
+		store_all(r, spmd_ternaryf((octant & 2) != 0, -b, b));
+	}
+	else
+	{
+		// No lane is in octant 1 or 2: octant 0 takes z, every other octant takes -z.
+		store_all(r, spmd_ternaryf(octant == 0, z, -z));
+	}
+		
+	// Small angle approximation, to decrease the max rel error near Pi.
+	SPMD_SIF(x >= (1.0f - .0003125f*4.0f))
+	{
+		// Uses store() rather than store_all(), so presumably only lanes passing the SPMD_SIF test are overwritten.
+		store(r, vfnma(floor(q) + 1.0f, fPi, orig_x));
+	}
+	SPMD_SENDIF
+
+	// Re-apply the original sign bit.
+	return cast_vint_to_vfloat(cast_vfloat_to_vint(r) ^ sgn);
+}
+
+// Seeds the generator state in x from seed, then discards the first 20 outputs.
+inline void spmd_kernel::seed_rand(rand_context& x, vint seed)
+{
+	store(x.a, 0xf1ea5eed);
+	store(x.b, seed ^ 0xd8487b1f);
+	store(x.c, seed ^ 0xdbadef9a);
+	store(x.d, seed);
+
+	// Warm-up rounds: the results are intentionally thrown away.
+	for (int remaining = 20; remaining > 0; --remaining)
+	{
+		(void)get_randu(x);
+	}
+}
+
+// Advances the generator state in x by one step and returns the new x.d.
+// https://burtleburtle.net/bob/rand/smallprng.html
+// Returns 32-bit unsigned random numbers.
+inline vint spmd_kernel::get_randu(rand_context& x)
+{
+	// Computed from the old a and b before either is overwritten.
+	vint mixed = x.a - VINT_ROT(x.b, 27);
+
+	// The stores below are order-dependent: each one reads state that later stores replace.
+	store(x.a, x.b ^ VINT_ROT(x.c, 17));
+	store(x.b, x.c + x.d);
+	store(x.c, x.d + mixed);
+	store(x.d, mixed + x.a);
+
+	return x.d;
+}
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vint spmd_kernel::get_randi(rand_context& x, vint low, vint high)
+{
+	vint span = high - low;
+	vint raw = get_randu(x);
+
+	// Scale the raw value into the span via mulhiu (presumably the high half of the unsigned product).
+	vint offset = mulhiu(raw, span);
+
+	return spmd_ternaryi(low < high, low + offset, low);
+}
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vfloat spmd_kernel::get_randf(rand_context& x, vfloat low, vfloat high)
+{
+	// Keep the low 23 bits of the raw value.
+	vint low_bits = get_randu(x) & 0x7fffff;
+
+	// Scale by 1 / 2^23 (8388608).
+	vfloat unit = (vfloat)(low_bits) * (1.0f / 8388608.0f);
+
+	vfloat span = high - low;
+	return spmd_ternaryf(low < high, vfma(span, unit, low), low);
+}
+
+// Builds the two 16-entry lookup tables consumed by reverse_bits().
+// tab1 holds the bit-reversed value of each 4-bit index; tab2 holds the same values shifted into the high nibble.
+CPPSPMD_FORCE_INLINE void spmd_kernel::init_reverse_bits(vint& tab1, vint& tab2)
+{
+	const uint8_t reversed_nibble[16] =
+	{
+		0, 8, 4, 12, 2, 10, 6, 14,
+		1, 9, 5, 13, 3, 11, 7, 15
+	};
+
+	const uint8_t reversed_nibble_hi[16] =
+	{
+		0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
+		0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0
+	};
+
+	store_all(tab1, init_lookup4(reversed_nibble));
+	store_all(tab2, init_lookup4(reversed_nibble_hi));
+}
+
+// Reverses the bits of k using the tables produced by init_reverse_bits():
+// the two table lookups are OR-ed together and the result is byte-swapped.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::reverse_bits(vint k, vint tab1, vint tab2)
+{
+	// Lookup on k itself goes through tab2 (high-nibble outputs).
+	vint from_low = table_lookup4_8(k & 0x7F7F7F7F, tab2);
+
+	// Lookup on k shifted right by 4 goes through tab1 (low-nibble outputs).
+	vint from_high = table_lookup4_8(VUINT_SHIFT_RIGHT(k, 4) & 0x7F7F7F7F, tab1);
+
+	vint combined = from_low | from_high;
+	return byteswap(combined);
+}
+
+// Counts leading zero bits: three shift stages (16, 8, 4), then a 16-entry table
+// lookup on the remaining top nibble.
+// NOTE(review): for x == 0 the count reaches 28 and s_tab[0] is 0, so this appears to
+// return 28 rather than 32 - confirm callers never pass 0.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros(vint x)
+{
+	// Leading zero count of each possible top-nibble value.
+	CPPSPMD_DECL(const uint8_t, s_tab[16]) = { 0, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+	vint nibble_table = init_lookup4(s_tab);
+
+	// Stage 1: top 16 bits clear -> count 16 and shift them away.
+	vbool top16_clear = (x & 0xFFFF0000) == 0;
+	vint count_a = spmd_ternaryi(top16_clear, 16, 0);
+	vint value_a = spmd_ternaryi(top16_clear, VINT_SHIFT_LEFT(x, 16), x);
+
+	// Stage 2: top 8 bits clear -> count 8 more.
+	vbool top8_clear = (value_a & 0xFF000000) == 0;
+	vint count_b = spmd_ternaryi(top8_clear, count_a + 8, count_a);
+	vint value_b = spmd_ternaryi(top8_clear, VINT_SHIFT_LEFT(value_a, 8), value_a);
+
+	// Stage 3: top 4 bits clear -> count 4 more.
+	vbool top4_clear = (value_b & 0xF0000000) == 0;
+	vint count_c = spmd_ternaryi(top4_clear, count_b + 4, count_b);
+	vint value_c = spmd_ternaryi(top4_clear, VINT_SHIFT_LEFT(value_b, 4), value_b);
+
+	// Remaining top nibble is resolved through the table.
+	return table_lookup4_8(VUINT_SHIFT_RIGHT(value_c, 28), nibble_table) + count_c;
+}
+
+// Counts leading zero bits using five shift stages (16, 8, 4, 2, 1) and no table lookup.
+// NOTE(review): for x == 0 every stage fires, giving 16 + 8 + 4 + 2 + 1 = 31, not 32.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros_alt(vint x)
+{
+	// Stage 1: top 16 bits clear -> count 16 and shift them away.
+	vbool top16_clear = (x & 0xFFFF0000) == 0;
+	vint count_a = spmd_ternaryi(top16_clear, 16, 0);
+	vint value_a = spmd_ternaryi(top16_clear, VINT_SHIFT_LEFT(x, 16), x);
+
+	// Stage 2: top 8 bits clear -> count 8 more.
+	vbool top8_clear = (value_a & 0xFF000000) == 0;
+	vint count_b = spmd_ternaryi(top8_clear, count_a + 8, count_a);
+	vint value_b = spmd_ternaryi(top8_clear, VINT_SHIFT_LEFT(value_a, 8), value_a);
+
+	// Stage 3: top 4 bits clear -> count 4 more.
+	vbool top4_clear = (value_b & 0xF0000000) == 0;
+	vint count_c = spmd_ternaryi(top4_clear, count_b + 4, count_b);
+	vint value_c = spmd_ternaryi(top4_clear, VINT_SHIFT_LEFT(value_b, 4), value_b);
+
+	// Stage 4: top 2 bits clear -> count 2 more.
+	vbool top2_clear = (value_c & 0xC0000000) == 0;
+	vint count_d = spmd_ternaryi(top2_clear, count_c + 2, count_c);
+	vint value_d = spmd_ternaryi(top2_clear, VINT_SHIFT_LEFT(value_c, 2), value_c);
+
+	// Stage 5: top bit clear -> count 1 more.
+	vbool top1_clear = (value_d & 0x80000000) == 0;
+	return spmd_ternaryi(top1_clear, count_d + 1, count_d);
+}
+
+// Counts trailing zero bits by converting the lowest set bit of x to a float and
+// reading back its biased exponent.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_trailing_zeros(vint x)
+{
+	// Isolate the least significant set bit of x and convert it to a float.
+	vint lowest_bit = x & -x;
+	vfloat lowest_bit_f = (vfloat)(lowest_bit);
+
+	// Extract the exponent field (bits 23 and up) and remove the 0x7F bias.
+	vint biased_exponent = VUINT_SHIFT_RIGHT(cast_vfloat_to_vint(lowest_bit_f), 23);
+	return biased_exponent - 0x7F;
+}
+
+// Population count via the usual pairwise-sum reduction: 2-bit sums, 4-bit sums,
+// per-byte sums, then a multiply to add the bytes together in the top byte.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x)
+{
+	vint pair_sums = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555);
+	vint nibble_sums = (pair_sums & 0x33333333) + (VUINT_SHIFT_RIGHT(pair_sums, 2) & 0x33333333);
+
+	// '+' binds tighter than '&', so the mask applies to the whole sum; parenthesized here to make that explicit.
+	vint byte_sums = (nibble_sums + VUINT_SHIFT_RIGHT(nibble_sums, 4)) & 0xF0F0F0F;
+
+	vint accumulated = byte_sums * 0x1010101;
+	return VUINT_SHIFT_RIGHT(accumulated, 24);
+}
+
+// Unsigned 16-bit a <= b, built from subs_epu16 (presumably a saturating subtract,
+// which is zero exactly when a <= b) compared against zero.
+CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b)
+{
+	const vint difference = subs_epu16(a, b);
+	return cmpeq_epi16(difference, vint(0));
+}
+
+// Unsigned 16-bit a >= b, expressed as b <= a.
+CPPSPMD_FORCE_INLINE vint cmpge_epu16(const vint &a, const vint &b)
+{
+	vint b_le_a = cmple_epu16(b, a);
+	return b_le_a;
+}
+
+// Unsigned 16-bit a > b: (b <= a) with the lanes where a == b masked out.
+CPPSPMD_FORCE_INLINE vint cmpgt_epu16(const vint &a, const vint &b)
+{
+	const vint equal_mask = cmpeq_epi16(a, b);
+	const vint b_le_a = cmple_epu16(b, a);
+	return andnot(equal_mask, b_le_a);
+}
+
+// Unsigned 16-bit a < b, expressed as b > a.
+CPPSPMD_FORCE_INLINE vint cmplt_epu16(const vint &a, const vint &b)
+{
+	vint b_gt_a = cmpgt_epu16(b, a);
+	return b_gt_a;
+}
+
+// Signed 16-bit a >= b: the union of the equal and greater-than masks.
+CPPSPMD_FORCE_INLINE vint cmpge_epi16(const vint &a, const vint &b)
+{
+	const vint equal_mask = cmpeq_epi16(a, b);
+	const vint greater_mask = cmpgt_epi16(a, b);
+	return equal_mask | greater_mask;
+}
+
+// Signed 16-bit a <= b, expressed as b >= a.
+CPPSPMD_FORCE_INLINE vint cmple_epi16(const vint &a, const vint &b)
+{
+	vint b_ge_a = cmpge_epi16(b, a);
+	return b_ge_a;
+}
+
+// Debug helper: prints every lane of v as a signed decimal, space separated, followed by a newline.
+void spmd_kernel::print_vint(vint v)
+{
+	uint32_t lane = 0;
+	while (lane < PROGRAM_COUNT)
+	{
+		printf("%i ", extract(v, lane));
+		++lane;
+	}
+
+	printf("\n");
+}
+
+// Debug helper: prints every lane of v as 1 or 0, space separated, followed by a newline.
+void spmd_kernel::print_vbool(vbool v)
+{
+	uint32_t lane = 0;
+	while (lane < PROGRAM_COUNT)
+	{
+		const int bit = extract(v, lane) ? 1 : 0;
+		printf("%i ", bit);
+		++lane;
+	}
+
+	printf("\n");
+}
+	
+// Debug helper: prints every lane of v as uppercase hex with a 0x prefix, space separated,
+// followed by a newline.
+void spmd_kernel::print_vint_hex(vint v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+	{
+		// %X requires an unsigned int argument; the lane value is a signed int (it is
+		// printed with %i elsewhere), so convert explicitly to keep the format/argument types matched.
+		printf("0x%X ", static_cast<unsigned int>(extract(v, i))); 
+	}
+	printf("\n"); 
+}
+
+// Debug helper: prints the optional prefix, then the index of every lane whose flag was
+// set by storeu_linear() (presumably the currently active lanes), then a newline.
+void spmd_kernel::print_active_lanes(const char *pPrefix)
+{
+	// Start with all flags clear, then let the store write 1 into the lanes it touches.
+	CPPSPMD_DECL(int, flags[PROGRAM_COUNT]);
+	memset(flags, 0, sizeof(flags));
+	storeu_linear(flags, vint(1));
+
+	if (pPrefix)
+	{
+		printf("%s", pPrefix);
+	}
+
+	for (uint32_t lane = 0; lane < PROGRAM_COUNT; ++lane)
+	{
+		if (!flags[lane])
+			continue;
+
+		printf("%u ", lane);
+	}
+
+	printf("\n");
+}
+	
+// Debug helper: prints every lane of v with %f, space separated, followed by a newline.
+void spmd_kernel::print_vfloat(vfloat v)
+{
+	uint32_t lane = 0;
+	while (lane < PROGRAM_COUNT)
+	{
+		printf("%f ", extract(v, lane));
+		++lane;
+	}
+
+	printf("\n");
+}
--- a/engine/thirdparty/basis_universal/encoder/cppspmd_math_declares.h
+++ b/engine/thirdparty/basis_universal/encoder/cppspmd_math_declares.h
@ -0,0 +1,89 @@
+// Do not include this header directly.
+// This header defines shared struct spmd_kernel helpers.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// See cppspmd_math.h for detailed error statistics.
+
+// Internal helpers used by the estimate functions below.
+CPPSPMD_FORCE_INLINE void reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment);
+CPPSPMD_FORCE_INLINE vfloat tan56(vfloat x);
+// Rational tangent approximation used by tan_est().
+CPPSPMD_FORCE_INLINE vfloat tan82(vfloat x);
+
+inline vfloat log2_est(vfloat v);
+
+inline vfloat log_est(vfloat v);
+
+inline vfloat exp2_est(vfloat arg);
+
+inline vfloat exp_est(vfloat arg);
+
+inline vfloat pow_est(vfloat arg1, vfloat arg2);
+
+// Reciprocal estimate. Positive values only.
+CPPSPMD_FORCE_INLINE vfloat recip_est1(const vfloat& q);
+// Reciprocal estimate for positive or negative values (works on abs() and re-applies sign()).
+CPPSPMD_FORCE_INLINE vfloat recip_est1_pn(const vfloat& q);
+
+inline vfloat mod_angles(vfloat a);
+
+// sin_flag selects sine (true) or cosine (false); see the two wrappers below.
+inline vfloat sincos_est_a(vfloat a, bool sin_flag);
+CPPSPMD_FORCE_INLINE vfloat sin_est_a(vfloat a) { return sincos_est_a(a, true); }
+CPPSPMD_FORCE_INLINE vfloat cos_est_a(vfloat a) { return sincos_est_a(a, false); }
+
+inline vfloat sin_est(vfloat a);
+
+inline vfloat cos_est(vfloat a);
+
+// Reciprocal square root estimate, one refinement step.
+// Don't call with values <= 0.
+CPPSPMD_FORCE_INLINE vfloat rsqrt_est1(vfloat x0);
+
+// Reciprocal square root estimate, two refinement steps.
+// Don't call with values <= 0.
+CPPSPMD_FORCE_INLINE vfloat rsqrt_est2(vfloat x0);
+
+CPPSPMD_FORCE_INLINE vfloat atan2_est(vfloat y, vfloat x);
+
+// Single-argument arctangent, implemented as atan2_est(x, 1).
+CPPSPMD_FORCE_INLINE vfloat atan_est(vfloat x) { return atan2_est(x, vfloat(1.0f)); }
+
+// Don't call this for angles close to 90/270!
+inline vfloat tan_est(vfloat x);
+
+// Per-lane random number generator state (four vint words).
+// https://burtleburtle.net/bob/rand/smallprng.html
+struct rand_context { vint a, b, c, d; };
+
+// Initializes x from seed and discards the first 20 outputs.
+inline void seed_rand(rand_context& x, vint seed);
+
+// Returns 32-bit unsigned random numbers.
+inline vint get_randu(rand_context& x);
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vint get_randi(rand_context& x, vint low, vint high);
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vfloat get_randf(rand_context& x, vfloat low, vfloat high);
+
+// init_reverse_bits() fills the two lookup tables that reverse_bits() expects as tab1/tab2.
+CPPSPMD_FORCE_INLINE void init_reverse_bits(vint& tab1, vint& tab2);
+CPPSPMD_FORCE_INLINE vint reverse_bits(vint k, vint tab1, vint tab2);
+
+// Two leading-zero counters: the first finishes with a table lookup, the _alt variant uses compare/shift stages only.
+// NOTE(review): for x == 0 the two appear to disagree (28 vs 31) and neither returns 32 - confirm callers avoid 0.
+CPPSPMD_FORCE_INLINE vint count_leading_zeros(vint x);
+CPPSPMD_FORCE_INLINE vint count_leading_zeros_alt(vint x);
+
+CPPSPMD_FORCE_INLINE vint count_trailing_zeros(vint x);
+
+// Population count.
+CPPSPMD_FORCE_INLINE vint count_set_bits(vint x);
+
+// Debug printing helpers; each writes one line to stdout via printf.
+void print_vint(vint v);
+void print_vbool(vbool v);
+void print_vint_hex(vint v);
+void print_active_lanes(const char *pPrefix);
+void print_vfloat(vfloat v);
+
--- a/engine/thirdparty/basis_universal/encoder/cppspmd_sse.h
+++ b/engine/thirdparty/basis_universal/encoder/cppspmd_sse.h
--- a/engine/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h
+++ b/engine/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h
@ -0,0 +1,47 @@
+// cppspmd_type_aliases.h
+// Do not include this file directly
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// Pulls the CPPSPMD vector/reference types into the including scope under short names.
+// Guarded so the aliases are only introduced once.
+#ifndef CPPSPMD_TYPES
+#define CPPSPMD_TYPES
+
+using exec_mask = CPPSPMD::exec_mask;
+
+// Integer types: the 16-bit family when CPPSPMD_INT16 is enabled, otherwise the default vint family.
+#if CPPSPMD_INT16
+using vint16 = CPPSPMD::vint16;
+using int16_lref = CPPSPMD::int16_lref;
+using cint16_vref = CPPSPMD::cint16_vref;
+using int16_vref = CPPSPMD::int16_vref;
+using lint16 = CPPSPMD::lint16;
+using vint16_vref = CPPSPMD::vint16_vref;
+#else
+using vint = CPPSPMD::vint;
+using int_lref = CPPSPMD::int_lref;
+using cint_vref = CPPSPMD::cint_vref;
+using int_vref = CPPSPMD::int_vref;
+using lint = CPPSPMD::lint;
+using vint_vref = CPPSPMD::vint_vref;
+#endif
+
+// Boolean and float types are aliased regardless of CPPSPMD_INT16.
+using vbool = CPPSPMD::vbool;
+using vfloat = CPPSPMD::vfloat;
+using float_lref = CPPSPMD::float_lref;
+using float_vref = CPPSPMD::float_vref;
+using vfloat_vref = CPPSPMD::vfloat_vref;
+
+#endif // CPPSPMD_TYPES
--- a/engine/thirdparty/basis_universal/encoder/pvpngreader.cpp
+++ b/engine/thirdparty/basis_universal/encoder/pvpngreader.cpp
--- a/engine/thirdparty/basis_universal/encoder/pvpngreader.h
+++ b/engine/thirdparty/basis_universal/encoder/pvpngreader.h
@ -0,0 +1,48 @@
+// pvpngreader.h - Public Domain - see unlicense at bottom of pvpngreader.cpp
+#pragma once
+#include <stdint.h>
+
+// Public interface of the PNG reader: color type constants, the png_info description
+// struct, and the two entry points get_png_info() and load_png().
+// NOTE(review): size_t is used below but only <stdint.h> is included by this header;
+// consider including <stddef.h> so the header is self-contained.
+namespace pv_png
+{
+	// PNG color types
+	enum
+	{
+		PNG_COLOR_TYPE_GREYSCALE = 0,
+		PNG_COLOR_TYPE_TRUECOLOR = 2,
+		PNG_COLOR_TYPE_PALETTIZED = 3,
+		PNG_COLOR_TYPE_GREYSCALE_ALPHA = 4,
+		PNG_COLOR_TYPE_TRUECOLOR_ALPHA = 6
+	};
+
+	// PNG file description
+	struct png_info
+	{
+		uint32_t m_width;
+		uint32_t m_height;
+				
+		uint32_t m_num_chans;	// The number of channels, factoring in transparency. Ranges from [1-4].
+
+		uint32_t m_bit_depth;	// PNG ihdr bit depth: 1, 2, 4, 8 or 16
+		uint32_t m_color_type;	// PNG ihdr color type, PNG_COLOR_TYPE_GREYSCALE etc.
+
+		bool m_has_gamma;		// true if the PNG file had a GAMA chunk
+		uint32_t m_gamma_value; // PNG GAMA chunk value, scaled by 100000
+
+		bool m_has_trns;		// true if the PNG file used colorkey transparency
+	};
+
+	// Retrieves information about the PNG file.
+	// Returns false on any errors.
+	bool get_png_info(const void* pImage_buf, size_t buf_size, png_info& info);
+
+	// Input parameters:
+	// pImage_buf, buf_size - pointer to PNG image data
+	// desired_chans - desired number of output channels. 0=auto, 1=grayscale, 2=grayscale alpha, 3=24bpp RGB, 4=32bpp RGBA
+	//
+	// Output parameters:
+	// width, height - PNG image resolution
+	// num_chans - actual number of channels in PNG, from [1,4] (factoring in transparency)
+	//
+	// Returns nullptr on any errors.
+	void* load_png(const void* pImage_buf, size_t buf_size, uint32_t desired_chans, uint32_t &width, uint32_t &height, uint32_t& num_chans);
+}