feat: updated engine version to 4.4-rc1

This commit is contained in:
Sara 2025-02-23 14:38:14 +01:00
parent ee00efde1f
commit 21ba8e33af
5459 changed files with 1128836 additions and 198305 deletions

View file

@ -0,0 +1,75 @@
// Cross-platform shading-language compatibility header (from betsy):
// maps HLSL/Metal-style type names and intrinsics onto their GLSL
// equivalents so the encoder shaders can be written once for all backends.

// Three-way min/max helpers (not GLSL built-ins).
#define min3(a, b, c) min(a, min(b, c))
#define max3(a, b, c) max(a, max(b, c))

// HLSL-style vector/matrix type aliases.
#define float2 vec2
#define float3 vec3
#define float4 vec4

#define int2 ivec2
#define int3 ivec3
#define int4 ivec4
#define uint2 uvec2
#define uint3 uvec3
#define uint4 uvec4

#define float2x2 mat2
#define float3x3 mat3
#define float4x4 mat4
#define ogre_float4x3 mat3x4

// No 16-bit scalar integer type is used here; widen to 32-bit uint.
#define ushort uint
#define ushort3 uint3
#define ushort4 uint4

//Short used for read operations. It's an int in GLSL & HLSL. An ushort in Metal
#define rshort int
#define rshort2 int2
#define rint int
//Short used for write operations. It's an int in GLSL. An ushort in HLSL & Metal
#define wshort2 int2
#define wshort3 int3

#define toFloat3x3(x) mat3(x)
#define buildFloat3x3(row0, row1, row2) mat3(row0, row1, row2)

// HLSL intrinsics expressed with GLSL built-ins.
#define mul(x, y) ((x) * (y))
#define saturate(x) clamp((x), 0.0, 1.0)
#define lerp mix
#define rsqrt inversesqrt
#define INLINE
#define NO_INTERPOLATION_PREFIX flat
#define NO_INTERPOLATION_SUFFIX

// Placeholders: GLSL needs no extra parameter plumbing for uniform params.
#define PARAMS_ARG_DECL
#define PARAMS_ARG

#define reversebits bitfieldReverse

// Texture sampling wrappers. GLSL uses combined texture+sampler objects,
// so the separate `sampler` argument is simply ignored.
#define OGRE_Sample(tex, sampler, uv) texture(tex, uv)
#define OGRE_SampleLevel(tex, sampler, uv, lod) textureLod(tex, uv, lod)
#define OGRE_SampleArray2D(tex, sampler, uv, arrayIdx) texture(tex, vec3(uv, arrayIdx))
#define OGRE_SampleArray2DLevel(tex, sampler, uv, arrayIdx, lod) textureLod(tex, vec3(uv, arrayIdx), lod)
#define OGRE_SampleArrayCubeLevel(tex, sampler, uv, arrayIdx, lod) textureLod(tex, vec4(uv, arrayIdx), lod)
#define OGRE_SampleGrad(tex, sampler, uv, ddx, ddy) textureGrad(tex, uv, ddx, ddy)
#define OGRE_SampleArray2DGrad(tex, sampler, uv, arrayIdx, ddx, ddy) textureGrad(tex, vec3(uv, arrayIdx), ddx, ddy)
#define OGRE_ddx(val) dFdx(val)
#define OGRE_ddy(val) dFdy(val)

// Unfiltered integer-coordinate texel loads.
#define OGRE_Load2D(tex, iuv, lod) texelFetch(tex, iuv, lod)
#define OGRE_LoadArray2D(tex, iuv, arrayIdx, lod) texelFetch(tex, ivec3(iuv, arrayIdx), lod)
#define OGRE_Load2DMS(tex, iuv, subsample) texelFetch(tex, iuv, subsample)

#define OGRE_Load3D(tex, iuv, lod) texelFetch(tex, ivec3(iuv), lod)

#define OGRE_GatherRed(tex, sampler, uv) textureGather(tex, uv, 0)
#define OGRE_GatherGreen(tex, sampler, uv) textureGather(tex, uv, 1)
#define OGRE_GatherBlue(tex, sampler, uv) textureGather(tex, uv, 2)

#define bufferFetch1(buffer, idx) texelFetch(buffer, idx).x

// Sampler-argument and by-reference parameter helpers for HLSL-style
// function signatures; GLSL needs no sampler plumbing, and out/inout
// qualifiers map directly.
#define OGRE_SAMPLER_ARG_DECL(samplerName)
#define OGRE_SAMPLER_ARG(samplerName)

#define OGRE_Texture3D_float4 sampler3D

#define OGRE_OUT_REF(declType, variableName) out declType variableName
#define OGRE_INOUT_REF(declType, variableName) inout declType variableName

View file

@ -0,0 +1,18 @@
Copyright 2020-2022 Matias N. Goldberg
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
This software uses code from:
* [GPURealTimeBC6H](https://github.com/knarkowicz/GPURealTimeBC6H), under public domain. Modifications by Matias N. Goldberg
* [rg-etc1](https://github.com/richgel999/rg-etc1/), Copyright (c) 2012 Rich Geldreich, zlib license. Extensive modifications by Matias N. Goldberg to adapt it as a compute shader
* [stb_dxt](https://github.com/nothings/stb/blob/master/stb_dxt.h), under dual-license: A. MIT License
Copyright (c) 2017 Sean Barrett, B. Public Domain (www.unlicense.org). Original by fabian "ryg" giesen - ported to C by stb. Modifications by Matias N. Goldberg to adapt it as a compute shader
* EAC loosely inspired on [etc2_encoder](https://github.com/titilambert/packaging-efl/blob/master/src/static_libs/rg_etc/etc2_encoder.c), Copyright (C) 2014 Jean-Philippe ANDRE, 2-clause BSD license
* ETC2 T & H modes based on [etc2_encoder](https://github.com/titilambert/packaging-efl/blob/master/src/static_libs/rg_etc/etc2_encoder.c), Copyright (C) 2014 Jean-Philippe ANDRE, 2-clause BSD license. A couple minor bugfixes applied by Matias N. Goldberg. Modifications made by Matias N. Goldberg to adapt it as a compute shader
* ETC2 P very loosely based on [etc2_encoder](https://github.com/titilambert/packaging-efl/blob/master/src/static_libs/rg_etc/etc2_encoder.c), Copyright (C) 2014 Jean-Philippe ANDRE, 2-clause BSD license. Considerable rewrite by Matias N. Goldberg to enhance its quality.

View file

@ -0,0 +1,18 @@
#!/usr/bin/env python
from misc.utility.scons_hints import *

Import("env")
Import("env_modules")

# Clone a dedicated environment for this module so its builders and flags
# don't leak into other modules.
env_betsy = env_modules.Clone()

# Betsy shaders, originally from https://github.com/darksylinc/betsy
# GLSL_HEADER converts each .glsl source into a generated C++ header
# (*.glsl.gen.h) that the module's .cpp files include.
env_betsy.GLSL_HEADER("bc6h.glsl")
env_betsy.GLSL_HEADER("bc1.glsl")
env_betsy.GLSL_HEADER("bc4.glsl")
env_betsy.GLSL_HEADER("alpha_stitch.glsl")
# Regenerate the headers whenever the shader-to-header generator changes.
env_betsy.Depends(Glob("*.glsl.gen.h"), ["#glsl_builders.py"])

# Godot source files
env_betsy.add_source_files(env.modules_sources, "*.cpp")

View file

@ -0,0 +1,23 @@
// RGB and Alpha components of ETC2 RGBA are computed separately.
// This compute shader merely stitches them together to form the final result
// It's also used by RG11 driver to stitch two R11 into one RG11
#[compute]
#version 450

#include "CrossPlatformSettings_piece_all.glsl"

// One invocation per compressed block.
layout(local_size_x = 8, //
		local_size_y = 8, //
		local_size_z = 1) in;

// Each source texel holds one independently-encoded 64-bit block (uint2).
layout(binding = 0) uniform usampler2D srcRGB;
layout(binding = 1) uniform usampler2D srcAlpha;
// Destination texel is the combined 128-bit block: RGB half in .xy,
// alpha (or second R11) half in .zw.
layout(binding = 2, rgba32ui) uniform restrict writeonly uimage2D dstTexture;

void main() {
	uint2 rgbBlock = OGRE_Load2D(srcRGB, int2(gl_GlobalInvocationID.xy), 0).xy;
	uint2 alphaBlock = OGRE_Load2D(srcAlpha, int2(gl_GlobalInvocationID.xy), 0).xy;

	imageStore(dstTexture, int2(gl_GlobalInvocationID.xy), uint4(rgbBlock.xy, alphaBlock.xy));
}

View file

@ -0,0 +1,491 @@
#[versions]
standard = "";
dithered = "#define BC1_DITHER";
#[compute]
#version 450
#include "CrossPlatformSettings_piece_all.glsl"
#define FLT_MAX 340282346638528859811704183484516925440.0f
layout(binding = 0) uniform sampler2D srcTex;
layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;
layout(std430, binding = 2) readonly restrict buffer globalBuffer {
float2 c_oMatch5[256];
float2 c_oMatch6[256];
};
layout(push_constant, std430) uniform Params {
uint p_numRefinements;
uint p_padding[3];
}
params;
layout(local_size_x = 8, //
local_size_y = 8, //
local_size_z = 1) in;
// Decodes a packed RGB565 value (stored in a float) into three 0-255 channels.
float3 rgb565to888(float rgb565) {
	// Split the packed 5:6:5 value into its three bit fields.
	const float r5 = floor(rgb565 / 2048.0f);
	const float g6 = floor(mod(rgb565, 2048.0f) / 32.0f);
	const float b5 = floor(mod(rgb565, 32.0f));

	// This is the correct 565 to 888 conversion:
	//		rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f )
	//
	// However stb_dxt follows a different one:
	//		rb = floor( rb * ( 256 / 32 + 8 / 32 ) );
	//		g  = floor( g * ( 256 / 64 + 4 / 64 ) );
	//
	// I'm not sure exactly why but it's possible this is how the S3TC specifies it should be decoded
	// It's quite possible this is the reason:
	//		http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/
	//
	// Or maybe it's just because it's cheap to do with integer shifts.
	// Anyway, we follow stb_dxt's conversion just in case
	// (gives almost the same result, with 1 or -1 of difference for a very few values)
	//
	// Perhaps when we make 888 -> 565 -> 888 it doesn't matter
	// because they end up mapping to the original number
	return floor(float3(r5, g6, b5) * float3(8.25f, 4.0625f, 8.25f));
}
// Packs an RGB888 color (each channel 0-255) into RGB565, returned as a float.
float rgb888to565(float3 rgbValue) {
	// Quantize: 8 -> 5 bits for R/B, 8 -> 6 bits for G, rounding to nearest.
	float3 quantized;
	quantized.rb = floor(rgbValue.rb * 31.0f / 255.0f + 0.5f);
	quantized.g = floor(rgbValue.g * 63.0f / 255.0f + 0.5f);

	// Assemble R << 11 | G << 5 | B, expressed in float arithmetic.
	return quantized.r * 2048.0f + quantized.g * 32.0f + quantized.b;
}
// linear interpolation at 1/3 point between a and b, using desired rounding type
// Returns the palette color at the 1/3 point between endpoints a and b,
// matching the decoder's integer truncation behavior.
float3 lerp13(float3 a, float3 b) {
#ifdef STB_DXT_USE_ROUNDING_BIAS
	// with rounding bias
	return a + floor((b - a) * (1.0f / 3.0f) + 0.5f);
#else
	// without rounding bias
	return floor((2.0f * a + b) / 3.0f);
#endif
}
/// Unpacks a block of 4 colors from two 16-bit endpoints
void EvalColors(out float3 colors[4], float c0, float c1) {
	// Decode both RGB565 endpoints to 888 space.
	colors[0] = rgb565to888(c0);
	colors[1] = rgb565to888(c1);
	// Two interpolated palette entries at the 1/3 and 2/3 points.
	colors[2] = lerp13(colors[0], colors[1]);
	colors[3] = lerp13(colors[1], colors[0]);
}
/** The color optimization function. (Clever code, part 1)
@param outMinEndp16 [out]
Minimum endpoint, in RGB565
@param outMaxEndp16 [out]
Maximum endpoint, in RGB565
*/
void OptimizeColorsBlock(const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16) {
	// determine color distribution
	float3 avgColor;
	float3 minColor;
	float3 maxColor;

	avgColor = minColor = maxColor = unpackUnorm4x8(srcPixelsBlock[0]).xyz;
	for (int i = 1; i < 16; ++i) {
		const float3 currColorUnorm = unpackUnorm4x8(srcPixelsBlock[i]).xyz;
		avgColor += currColorUnorm;
		minColor = min(minColor, currColorUnorm);
		maxColor = max(maxColor, currColorUnorm);
	}

	// Work in 0-255 space from here on (avg of 16 pixels, rounded).
	avgColor = round(avgColor * 255.0f / 16.0f);
	maxColor *= 255.0f;
	minColor *= 255.0f;

	// determine covariance matrix (symmetric 3x3 stored as 6 unique terms)
	float cov[6];
	for (int i = 0; i < 6; ++i) {
		cov[i] = 0;
	}

	for (int i = 0; i < 16; ++i) {
		const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
		float3 rgbDiff = currColor - avgColor;

		cov[0] += rgbDiff.r * rgbDiff.r;
		cov[1] += rgbDiff.r * rgbDiff.g;
		cov[2] += rgbDiff.r * rgbDiff.b;
		cov[3] += rgbDiff.g * rgbDiff.g;
		cov[4] += rgbDiff.g * rgbDiff.b;
		cov[5] += rgbDiff.b * rgbDiff.b;
	}

	// convert covariance matrix to float, find principal axis via power iter
	for (int i = 0; i < 6; ++i) {
		cov[i] /= 255.0f;
	}

	// Power iteration: repeatedly multiply the bbox diagonal by the covariance
	// matrix to converge towards its dominant eigenvector (the principal axis
	// of the color cloud).
	float3 vF = maxColor - minColor;

	const int nIterPower = 4;
	for (int iter = 0; iter < nIterPower; ++iter) {
		const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2];
		const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4];
		const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5];

		vF.r = r;
		vF.g = g;
		vF.b = b;
	}

	float magn = max3(abs(vF.r), abs(vF.g), abs(vF.b));
	float3 v;

	if (magn < 4.0f) { // too small, default to luminance
		v.r = 299.0f; // JPEG YCbCr luma coefs, scaled by 1000.
		v.g = 587.0f;
		v.b = 114.0f;
	} else {
		v = trunc(vF * (512.0f / magn));
	}

	// Pick colors at extreme points: the pixels with the smallest and largest
	// projection onto the principal axis become the endpoints.
	float3 minEndpoint, maxEndpoint;
	float minDot = FLT_MAX;
	float maxDot = -FLT_MAX;
	for (int i = 0; i < 16; ++i) {
		const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
		const float dotValue = dot(currColor, v);

		if (dotValue < minDot) {
			minDot = dotValue;
			minEndpoint = currColor;
		}

		if (dotValue > maxDot) {
			maxDot = dotValue;
			maxEndpoint = currColor;
		}
	}

	outMinEndp16 = rgb888to565(minEndpoint);
	outMaxEndp16 = rgb888to565(maxEndpoint);
}
// The color matching function
// Projects every pixel onto the line through the palette colors and picks the
// nearest of the 4 entries, packing the 16 2-bit indices into the returned mask.
uint MatchColorsBlock(const uint srcPixelsBlock[16], float3 color[4]) {
	uint mask = 0u;
	float3 dir = color[0] - color[1];
	float stops[4];

	for (int i = 0; i < 4; ++i) {
		stops[i] = dot(color[i], dir);
	}

	// think of the colors as arranged on a line; project point onto that line, then choose
	// next color out of available ones. we compute the crossover points for "best color in top
	// half"/"best in bottom half" and then the same inside that subinterval.
	//
	// relying on this 1d approximation isn't always optimal in terms of euclidean distance,
	// but it's very close and a lot faster.
	// http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
	float c0Point = trunc((stops[1] + stops[3]) * 0.5f);
	float halfPoint = trunc((stops[3] + stops[2]) * 0.5f);
	float c3Point = trunc((stops[2] + stops[0]) * 0.5f);

#ifndef BC1_DITHER
	// the version without dithering is straightforward
	for (uint i = 16u; i-- > 0u;) {
		const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;

		const float dotValue = dot(currColor, dir);
		mask <<= 2u;

		if (dotValue < halfPoint) {
			mask |= ((dotValue < c0Point) ? 1u : 3u);
		} else {
			mask |= ((dotValue < c3Point) ? 2u : 0u);
		}
	}
#else
	// with floyd-steinberg dithering
	float4 ep1 = float4(0, 0, 0, 0);
	float4 ep2 = float4(0, 0, 0, 0);

	// Thresholds scaled by 16 so diffused error terms can stay in integer-ish units.
	c0Point *= 16.0f;
	halfPoint *= 16.0f;
	c3Point *= 16.0f;

	for (uint y = 0u; y < 4u; ++y) {
		float ditherDot;
		uint lmask, step;

		float3 currColor;
		float dotValue;

		// Pixel 0 of the row: only errors diffused from the previous row.
		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 0]).xyz * 255.0f;
		dotValue = dot(currColor, dir);

		ditherDot = (dotValue * 16.0f) + (3 * ep2[1] + 5 * ep2[0]);
		if (ditherDot < halfPoint) {
			step = (ditherDot < c0Point) ? 1u : 3u;
		} else {
			step = (ditherDot < c3Point) ? 2u : 0u;
		}
		ep1[0] = dotValue - stops[step];
		lmask = step;

		// Pixel 1 of the row.
		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 1]).xyz * 255.0f;
		dotValue = dot(currColor, dir);

		ditherDot = (dotValue * 16.0f) + (7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]);
		if (ditherDot < halfPoint) {
			step = (ditherDot < c0Point) ? 1u : 3u;
		} else {
			step = (ditherDot < c3Point) ? 2u : 0u;
		}
		ep1[1] = dotValue - stops[step];
		lmask |= step << 2u;

		// Pixel 2 of the row.
		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 2]).xyz * 255.0f;
		dotValue = dot(currColor, dir);

		ditherDot = (dotValue * 16.0f) + (7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]);
		if (ditherDot < halfPoint) {
			step = (ditherDot < c0Point) ? 1u : 3u;
		} else {
			step = (ditherDot < c3Point) ? 2u : 0u;
		}
		ep1[2] = dotValue - stops[step];
		lmask |= step << 4u;

		// Pixel 3 of the row.
		// BUGFIX: this previously re-read pixel 2 (y * 4 + 2); the fourth
		// pixel of the row is y * 4 + 3 (matches stb_dxt's dither pass, which
		// is also why the diffused error here comes from ep1[2]).
		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 3]).xyz * 255.0f;
		dotValue = dot(currColor, dir);

		ditherDot = (dotValue * 16.0f) + (7 * ep1[2] + 5 * ep2[3] + ep2[2]);
		if (ditherDot < halfPoint) {
			step = (ditherDot < c0Point) ? 1u : 3u;
		} else {
			step = (ditherDot < c3Point) ? 2u : 0u;
		}
		ep1[3] = dotValue - stops[step];
		lmask |= step << 6u;

		mask |= lmask << (y * 8u);

		// Swap error rows: this row's errors diffuse into the next row.
		{
			float4 tmp = ep1;
			ep1 = ep2;
			ep2 = tmp;
		} // swap
	}
#endif

	return mask;
}
// The refinement function. (Clever code, part 2)
// Tries to optimize colors to suit block contents better.
// (By solving a least squares system via normal equations+Cramer's rule)
bool RefineBlock(const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
		inout float inOutMaxEndp16) {
	float newMin16, newMax16;
	const float oldMin = inOutMinEndp16;
	const float oldMax = inOutMaxEndp16;

	if ((mask ^ (mask << 2u)) < 4u) // all pixels have the same index?
	{
		// yes, linear system would be singular; solve using optimal
		// single-color match on average color
		float3 rgbVal = float3(8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f);
		for (int i = 0; i < 16; ++i) {
			rgbVal += unpackUnorm4x8(srcPixelsBlock[i]).xyz;
		}

		rgbVal = floor(rgbVal * (255.0f / 16.0f));

		// c_oMatch5/c_oMatch6 hold precomputed optimal endpoint pairs for
		// reproducing a single 8-bit value with 5-/6-bit interpolation.
		newMax16 = c_oMatch5[uint(rgbVal.r)][0] * 2048.0f + //
				c_oMatch6[uint(rgbVal.g)][0] * 32.0f + //
				c_oMatch5[uint(rgbVal.b)][0];
		newMin16 = c_oMatch5[uint(rgbVal.r)][1] * 2048.0f + //
				c_oMatch6[uint(rgbVal.g)][1] * 32.0f + //
				c_oMatch5[uint(rgbVal.b)][1];
	} else {
		const float w1Tab[4] = { 3, 0, 2, 1 };
		const float prods[4] = { 589824.0f, 2304.0f, 262402.0f, 66562.0f };
		// ^some magic to save a lot of multiplies in the accumulating loop...
		// (precomputed products of weights for least squares system, accumulated inside one 32-bit
		// register)

		float akku = 0.0f;
		uint cm = mask;
		float3 at1 = float3(0, 0, 0);
		float3 at2 = float3(0, 0, 0);
		// Accumulate the weighted sums of the least-squares system, walking
		// the 2-bit indices in cm alongside the pixels.
		for (int i = 0; i < 16; ++i, cm >>= 2u) {
			const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
			const uint step = cm & 3u;
			const float w1 = w1Tab[step];
			akku += prods[step];
			at1 += currColor * w1;
			at2 += currColor;
		}

		at2 = 3.0f * at2 - at1;

		// extract solutions and decide solvability
		// (the three packed accumulators are unpacked out of akku's bit fields)
		const float xx = floor(akku / 65535.0f);
		const float yy = floor(mod(akku, 65535.0f) / 256.0f);
		const float xy = mod(akku, 256.0f);

		// Cramer's rule scale factors, folded together with the 888 -> 565
		// quantization (x for R/B: 5 bits, y for G: 6 bits).
		float2 f_rb_g;
		f_rb_g.x = 3.0f * 31.0f / 255.0f / (xx * yy - xy * xy);
		f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

		// solve.
		const float3 newMaxVal = clamp(floor((at1 * yy - at2 * xy) * f_rb_g.xyx + 0.5f),
				float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
		newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

		const float3 newMinVal = clamp(floor((at2 * xx - at1 * xy) * f_rb_g.xyx + 0.5f),
				float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
		newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
	}

	inOutMinEndp16 = newMin16;
	inOutMaxEndp16 = newMax16;

	// True when the endpoints actually moved (caller iterates until stable).
	return oldMin != newMin16 || oldMax != newMax16;
}
#ifdef BC1_DITHER
/// Quantizes 'srcValue' which is originally in 888 (full range),
/// converting it to 565 and then back to 888 (quantized)
/// Quantizes 'srcValue' which is originally in 888 (full range),
/// converting it to 565 and then back to 888 (quantized)
float3 quant(float3 srcValue) {
	// Clamp into the representable 8-bit range first.
	float3 value = clamp(srcValue, 0.0f, 255.0f);
	// Convert 888 -> 565 (round to nearest)...
	value = floor(value * float3(31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f) + 0.5f);
	// ...and back 565 -> 888, using stb_dxt's expansion factors.
	return floor(value * float3(8.25f, 4.0625f, 8.25f));
}
void DitherBlock(const uint srcPixBlck[16], out uint dthPixBlck[16]) {
	// Floyd-Steinberg-style error diffusion in 888 space, one 4-pixel row at
	// a time. ep1 holds this row's quantization errors, ep2 the previous row's.
	float3 ep1[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };
	float3 ep2[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };

	for (uint y = 0u; y < 16u; y += 4u) {
		float3 srcPixel, dithPixel;

		// Pixel 0: only errors diffused down from the previous row.
		srcPixel = unpackUnorm4x8(srcPixBlck[y + 0u]).xyz * 255.0f;
		dithPixel = quant(srcPixel + trunc((3 * ep2[1] + 5 * ep2[0]) * (1.0f / 16.0f)));
		ep1[0] = srcPixel - dithPixel;
		dthPixBlck[y + 0u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));

		// Pixel 1: previous-row errors plus 7/16 of pixel 0's error.
		srcPixel = unpackUnorm4x8(srcPixBlck[y + 1u]).xyz * 255.0f;
		dithPixel = quant(
				srcPixel + trunc((7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]) * (1.0f / 16.0f)));
		ep1[1] = srcPixel - dithPixel;
		dthPixBlck[y + 1u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));

		// Pixel 2.
		srcPixel = unpackUnorm4x8(srcPixBlck[y + 2u]).xyz * 255.0f;
		dithPixel = quant(
				srcPixel + trunc((7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]) * (1.0f / 16.0f)));
		ep1[2] = srcPixel - dithPixel;
		dthPixBlck[y + 2u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));

		// Pixel 3: last in the row, so no right-neighbor term.
		srcPixel = unpackUnorm4x8(srcPixBlck[y + 3u]).xyz * 255.0f;
		dithPixel = quant(srcPixel + trunc((7 * ep1[2] + 5 * ep2[3] + ep2[2]) * (1.0f / 16.0f)));
		ep1[3] = srcPixel - dithPixel;
		dthPixBlck[y + 3u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));

		// swap( ep1, ep2 )
		for (uint i = 0u; i < 4u; ++i) {
			float3 tmp = ep1[i];
			ep1[i] = ep2[i];
			ep2[i] = tmp;
		}
	}
}
#endif
void main() {
	uint srcPixelsBlock[16];

	bool bAllColorsEqual = true;

	// Load the whole 4x4 block
	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
	for (uint i = 0u; i < 16u; ++i) {
		const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i & 0x03u, i >> 2u);
		const float3 srcPixels0 = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyz;
		// Pixels are kept packed as 8-bit UNORM RGBA for compact storage.
		srcPixelsBlock[i] = packUnorm4x8(float4(srcPixels0, 1.0f));
		bAllColorsEqual = bAllColorsEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
	}

	float maxEndp16, minEndp16;
	uint mask = 0u;

	if (bAllColorsEqual) {
		// Solid-color block: take the precomputed optimal endpoint pair from
		// the match tables and use index 2 (the 1/3 interpolant) everywhere
		// (0xAAAAAAAA = binary 10 repeated 16 times).
		const uint3 rgbVal = uint3(unpackUnorm4x8(srcPixelsBlock[0]).xyz * 255.0f);
		mask = 0xAAAAAAAAu;
		maxEndp16 =
				c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
		minEndp16 =
				c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
	} else {
#ifdef BC1_DITHER
		uint ditherPixelsBlock[16];
		// first step: compute dithered version for PCA if desired
		DitherBlock(srcPixelsBlock, ditherPixelsBlock);
#else
#define ditherPixelsBlock srcPixelsBlock
#endif

		// second step: pca+map along principal axis
		OptimizeColorsBlock(ditherPixelsBlock, minEndp16, maxEndp16);
		if (minEndp16 != maxEndp16) {
			float3 colors[4];
			EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
			mask = MatchColorsBlock(srcPixelsBlock, colors);
		}

		// third step: refine (multiple times if requested)
		bool bStopRefinement = false;
		for (uint i = 0u; i < params.p_numRefinements && !bStopRefinement; ++i) {
			const uint lastMask = mask;

			if (RefineBlock(ditherPixelsBlock, mask, minEndp16, maxEndp16)) {
				if (minEndp16 != maxEndp16) {
					float3 colors[4];
					EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
					mask = MatchColorsBlock(srcPixelsBlock, colors);
				} else {
					// Endpoints collapsed: all pixels map to endpoint 0.
					mask = 0u;
					bStopRefinement = true;
				}
			}

			// Stop as soon as a refinement pass no longer changes the indices.
			bStopRefinement = mask == lastMask || bStopRefinement;
		}
	}

	// write the color block
	// BC1's 4-color mode requires endpoint0 > endpoint1; if they ended up
	// swapped, exchange them and remap indices 0<->1, 2<->3 (XOR each 2-bit
	// index with 01).
	if (maxEndp16 < minEndp16) {
		const float tmpValue = minEndp16;
		minEndp16 = maxEndp16;
		maxEndp16 = tmpValue;
		mask ^= 0x55555555u;
	}

	uint2 outputBytes;
	outputBytes.x = uint(maxEndp16) | (uint(minEndp16) << 16u);
	outputBytes.y = mask;

	uint2 dstUV = gl_GlobalInvocationID.xy;
	imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u));
}

View file

@ -0,0 +1,153 @@
#[versions]
unsigned = "";
signed = "#define SNORM";
#[compute]
#version 450
#include "CrossPlatformSettings_piece_all.glsl"
#VERSION_DEFINES
shared float2 g_minMaxValues[4u * 4u * 4u];
shared uint2 g_mask[4u * 4u];
layout(binding = 0) uniform sampler2D srcTex;
layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;
layout(push_constant, std430) uniform Params {
uint p_channelIdx;
uint p_padding[3];
}
params;
layout(local_size_x = 4, //
local_size_y = 4, //
local_size_z = 4) in;
/// Each block is 16 pixels
/// Each thread works on 4 pixels
/// Therefore each block needs 4 threads, generating 8 masks
/// At the end these 8 masks get merged into 2 and results written to output
///
/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?**
///
/// A: It's a sweetspot.
/// - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
/// - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
/// overhead, and also more LDS usage which reduces occupancy.
/// - Long threads (e.g. 1 thread per block) misses parallelism opportunities
void main() {
	float minVal, maxVal;
	float4 srcPixel;

	const uint blockThreadId = gl_LocalInvocationID.x;

	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u;

	// Each of the 4 threads loads one row (4 pixels) of its 4x4 block and
	// extracts the channel selected by the push constant
	// (0 = red, 1 = green, anything else = alpha).
	for (uint i = 0u; i < 4u; ++i) {
		const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i, blockThreadId);

		const float4 value = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyzw;
		srcPixel[i] = params.p_channelIdx == 0 ? value.x : (params.p_channelIdx == 1 ? value.y : value.w);
		srcPixel[i] *= 255.0f;
	}

	// Row-local min/max over the 4 loaded values.
	minVal = min3(srcPixel.x, srcPixel.y, srcPixel.z);
	maxVal = max3(srcPixel.x, srcPixel.y, srcPixel.z);
	minVal = min(minVal, srcPixel.w);
	maxVal = max(maxVal, srcPixel.w);

	// LDS slots for this block: 4 min/max entries (one per thread) and one
	// shared 64-bit index mask.
	const uint minMaxIdxBase = (gl_LocalInvocationID.z << 4u) + (gl_LocalInvocationID.y << 2u);
	const uint maskIdxBase = (gl_LocalInvocationID.z << 2u) + gl_LocalInvocationID.y;

	g_minMaxValues[minMaxIdxBase + blockThreadId] = float2(minVal, maxVal);
	g_mask[maskIdxBase] = uint2(0u, 0u);

	memoryBarrierShared();
	barrier();

	// Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
	for (uint i = 0u; i < 4u; ++i) {
		minVal = min(g_minMaxValues[minMaxIdxBase + i].x, minVal);
		maxVal = max(g_minMaxValues[minMaxIdxBase + i].y, maxVal);
	}

	// determine bias and emit color indices
	// given the choice of maxVal/minVal, these indices are optimal:
	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
	float dist = maxVal - minVal;
	float dist4 = dist * 4.0f;
	float dist2 = dist * 2.0f;
	float bias = (dist < 8) ? (dist - 1) : (trunc(dist * 0.5f) + 2);
	bias -= minVal * 7;

	uint mask0 = 0u, mask1 = 0u;

	for (uint i = 0u; i < 4u; ++i) {
		float a = srcPixel[i] * 7.0f + bias;

		int ind = 0;

		// select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
		if (a >= dist4) {
			ind = 4;
			a -= dist4;
		}

		if (a >= dist2) {
			ind += 2;
			a -= dist2;
		}

		if (a >= dist) {
			ind += 1;
		}

		// turn linear scale into DXT index (0/1 are extremal pts)
		ind = -ind & 7;
		ind ^= (2 > ind) ? 1 : 0;

		// write index
		// The 3-bit index starts after the two 8-bit endpoints (bit 16);
		// indices straddling the 32-bit boundary are split across mask0/mask1.
		const uint bits = 16u + ((blockThreadId << 2u) + i) * 3u;
		if (bits < 32u) {
			mask0 |= uint(ind) << bits;
			if (bits + 3u > 32u) {
				mask1 |= uint(ind) >> (32u - bits);
			}
		} else {
			mask1 |= uint(ind) << (bits - 32u);
		}
	}

	// Merge this thread's index bits into the block's shared mask.
	if (mask0 != 0u) {
		atomicOr(g_mask[maskIdxBase].x, mask0);
	}
	if (mask1 != 0u) {
		atomicOr(g_mask[maskIdxBase].y, mask1);
	}

	memoryBarrierShared();
	barrier();

	if (blockThreadId == 0u) {
		// Save data
		// Low word: byte 0 = max endpoint, byte 1 = min endpoint, then indices.
		uint2 outputBytes;

#ifdef SNORM
		outputBytes.x =
				packSnorm4x8(float4(maxVal * (1.0f / 255.0f) * 2.0f - 1.0f,
						minVal * (1.0f / 255.0f) * 2.0f - 1.0f, 0.0f, 0.0f));
#else
		outputBytes.x = packUnorm4x8(
				float4(maxVal * (1.0f / 255.0f), minVal * (1.0f / 255.0f), 0.0f, 0.0f));
#endif
		outputBytes.x |= g_mask[maskIdxBase].x;
		outputBytes.y = g_mask[maskIdxBase].y;

		uint2 dstUV = gl_GlobalInvocationID.yz;
		imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u));
	}
}

View file

@ -0,0 +1,718 @@
#[versions]
signed = "#define SIGNED";
unsigned = "#define QUALITY"; // The "Quality" preset causes artifacting on signed data, so for now it's exclusive to unsigned.
#[compute]
#version 450
#include "CrossPlatformSettings_piece_all.glsl"
#VERSION_DEFINES
// HLSL-style f32tof16 emulation: returns each component's binary16 bit
// pattern (as a float holding an integer value) via packHalf2x16 with a
// zeroed high half.
float3 f32tof16(float3 value) {
	return float3(packHalf2x16(float2(value.x, 0.0)),
			packHalf2x16(float2(value.y, 0.0)),
			packHalf2x16(float2(value.z, 0.0)));
}

// Inverse: reinterprets the low 16 bits of each uint as a half-float.
float3 f16tof32(uint3 value) {
	return float3(unpackHalf2x16(value.x).x,
			unpackHalf2x16(value.y).x,
			unpackHalf2x16(value.z).x);
}

// Scalar overloads. Note: `value.x` on a scalar is a legal single-component
// swizzle in GLSL 4.20+.
float f32tof16(float value) {
	return packHalf2x16(float2(value.x, 0.0));
}

float f16tof32(uint value) {
	return unpackHalf2x16(value.x).x;
}
layout(binding = 0) uniform sampler2D srcTexture;
layout(binding = 1, rgba32ui) uniform restrict writeonly uimage2D dstTexture;
layout(push_constant, std430) uniform Params {
float2 p_textureSizeRcp;
uint padding0;
uint padding1;
}
params;
const float HALF_MAX = 65504.0f;
const uint PATTERN_NUM = 32u;
#ifdef SIGNED
const float HALF_MIN = -65504.0f;
#else
const float HALF_MIN = 0.0f;
#endif
#ifdef SIGNED
// https://github.com/godotengine/godot/pull/96377#issuecomment-2323488254
// https://github.com/godotengine/godot/pull/96377#issuecomment-2323450950
// True when `a` is strictly below zero.
bool isNegative(float a) {
	return a < 0.0f;
}

// Squared log2 error between two non-negative values (offset by 1 so that
// zero inputs are well-defined).
float CalcSignlessMSLE(float a, float b) {
	const float logRatio = log2((b + 1.0f) / (a + 1.0f));
	return logRatio * logRatio;
}

// Error between two values of opposite sign: measure each magnitude's
// distance to zero and add the halves together.
float CrossCalcMSLE(float a, float b) {
	return CalcSignlessMSLE(0.0f, abs(a)) + CalcSignlessMSLE(0.0f, abs(b));
}
// Mean-squared-log error between two colors, accumulated per channel.
// Channels whose values straddle zero split the error across the two sign
// halves (see the PR discussion links above).
float CalcMSLE(float3 a, float3 b) {
	float result = 0.0f;

	for (int i = 0; i < 3; ++i) {
		if (isNegative(a[i]) != isNegative(b[i])) {
			result += CrossCalcMSLE(a[i], b[i]);
		} else {
			result += CalcSignlessMSLE(abs(a[i]), abs(b[i]));
		}
	}

	return result;
}
#else
// Mean-squared error in log2 space, summed over the three channels.
float CalcMSLE(float3 a, float3 b) {
	float3 logRatio = log2((b + 1.0f) / (a + 1.0f));
	logRatio = logRatio * logRatio;
	return logRatio.x + logRatio.y + logRatio.z;
}
#endif
// Returns the "fixup" pixel of subset 1 for 2-subset partition `i`:
// 15 by default, 2 or 8 for the partitions flagged in the two bitmask
// constants (each constant is a 32-bit mask indexed by partition id).
uint PatternFixupID(uint i) {
	uint ret = 15u;

	ret = ((3441033216u >> i) & 0x1u) != 0 ? 2u : ret;
	ret = ((845414400u >> i) & 0x1u) != 0 ? 8u : ret;

	return ret;
}

// Returns the subset (0 or 1) that pixel `i` belongs to in partition `p`.
// The 32 partition masks are packed two per uint, 16 bits each: p/2 selects
// the constant, and p%2 selects its high or low 16-bit half.
uint Pattern(uint p, uint i) {
	uint p2 = p / 2u;
	uint p3 = p - p2 * 2u;

	uint enc = 0u;
	enc = p2 == 0u ? 2290666700u : enc;
	enc = p2 == 1u ? 3972591342u : enc;
	enc = p2 == 2u ? 4276930688u : enc;
	enc = p2 == 3u ? 3967876808u : enc;
	enc = p2 == 4u ? 4293707776u : enc;
	enc = p2 == 5u ? 3892379264u : enc;
	enc = p2 == 6u ? 4278255592u : enc;
	enc = p2 == 7u ? 4026597360u : enc;
	enc = p2 == 8u ? 9369360u : enc;
	enc = p2 == 9u ? 147747072u : enc;
	enc = p2 == 10u ? 1930428556u : enc;
	enc = p2 == 11u ? 2362323200u : enc;
	enc = p2 == 12u ? 823134348u : enc;
	enc = p2 == 13u ? 913073766u : enc;
	enc = p2 == 14u ? 267393000u : enc;
	enc = p2 == 15u ? 966553998u : enc;

	enc = p3 != 0u ? enc >> 16u : enc;
	uint ret = (enc >> i) & 0x1u;
	return ret;
}
#ifndef SIGNED
//UF
// Unsigned (UF16) path: endpoints are quantized by rescaling the half-float
// bit pattern (0..0x7bff) down to a 7-/9-/10-bit range, and unquantized by
// rescaling back up (with a half-step rounding offset of 0x8000/65536).
float3 Quantize7(float3 x) {
	return (f32tof16(x) * 128.0f) / (0x7bff + 1.0f);
}

float3 Quantize9(float3 x) {
	return (f32tof16(x) * 512.0f) / (0x7bff + 1.0f);
}

float3 Quantize10(float3 x) {
	return (f32tof16(x) * 1024.0f) / (0x7bff + 1.0f);
}

float3 Unquantize7(float3 x) {
	return (x * 65536.0f + 0x8000) / 128.0f;
}

float3 Unquantize9(float3 x) {
	return (x * 65536.0f + 0x8000) / 512.0f;
}

float3 Unquantize10(float3 x) {
	return (x * 65536.0f + 0x8000) / 1024.0f;
}

// Interpolates two unquantized endpoints with a 6-bit weight (0..64), then
// applies the final scale (31/64, folded with the /64 weight denominator into
// 31/4096) and reinterprets the result as half-float bits.
float3 FinishUnquantize(float3 endpoint0Unq, float3 endpoint1Unq, float weight) {
	float3 comp = (endpoint0Unq * (64.0f - weight) + endpoint1Unq * weight + 32.0f) * (31.0f / 4096.0f);
	return f16tof32(uint3(comp));
}
#else
//SF
// Signed (SF16) path: same scheme as the unsigned path but sign and magnitude
// are handled separately, with one magnitude bit traded for the sign.
float3 cmpSign(float3 value) {
	// Like sign(), but maps 0 to +1.0 so zero keeps a well-defined sign factor.
	float3 signVal;
	signVal.x = value.x >= 0.0f ? 1.0f : -1.0f;
	signVal.y = value.y >= 0.0f ? 1.0f : -1.0f;
	signVal.z = value.z >= 0.0f ? 1.0f : -1.0f;
	return signVal;
}

float3 Quantize7(float3 x) {
	float3 signVal = cmpSign(x);
	return signVal * (f32tof16(abs(x)) * 64.0f) / (0x7bff + 1.0f);
}

float3 Quantize9(float3 x) {
	float3 signVal = cmpSign(x);
	return signVal * (f32tof16(abs(x)) * 256.0f) / (0x7bff + 1.0f);
}

float3 Quantize10(float3 x) {
	float3 signVal = cmpSign(x);
	return signVal * (f32tof16(abs(x)) * 512.0f) / (0x7bff + 1.0f);
}

float3 Unquantize7(float3 x) {
	float3 signVal = sign(x);
	x = abs(x);
	float3 finalVal = signVal * (x * 32768.0f + 0x4000) / 64.0f;
	// Fully-saturated magnitudes clamp to 32767 (max positive 15-bit value).
	finalVal.x = x.x >= 64.0f ? 32767.0 : finalVal.x;
	finalVal.y = x.y >= 64.0f ? 32767.0 : finalVal.y;
	finalVal.z = x.z >= 64.0f ? 32767.0 : finalVal.z;
	return finalVal;
}

float3 Unquantize9(float3 x) {
	float3 signVal = sign(x);
	x = abs(x);
	float3 finalVal = signVal * (x * 32768.0f + 0x4000) / 256.0f;
	finalVal.x = x.x >= 256.0f ? 32767.0 : finalVal.x;
	finalVal.y = x.y >= 256.0f ? 32767.0 : finalVal.y;
	finalVal.z = x.z >= 256.0f ? 32767.0 : finalVal.z;
	return finalVal;
}

float3 Unquantize10(float3 x) {
	float3 signVal = sign(x);
	x = abs(x);
	float3 finalVal = signVal * (x * 32768.0f + 0x4000) / 512.0f;
	finalVal.x = x.x >= 512.0f ? 32767.0 : finalVal.x;
	finalVal.y = x.y >= 512.0f ? 32767.0 : finalVal.y;
	finalVal.z = x.z >= 512.0f ? 32767.0 : finalVal.z;
	return finalVal;
}

// Interpolates two unquantized endpoints with a 6-bit weight (0..64). The
// signed path divides by 2048 (vs 4096 unsigned) because magnitudes were
// unquantized against 32768 rather than 65536 above.
float3 FinishUnquantize(float3 endpoint0Unq, float3 endpoint1Unq, float weight) {
	float3 comp = (endpoint0Unq * (64.0f - weight) + endpoint1Unq * weight + 32.0f) * (31.0f / 2048.0f);
	return f16tof32(uint3(comp));
}
#endif
// Exchanges two RGB endpoints in place.
void Swap(inout float3 a, inout float3 b) {
	const float3 held = a;
	a = b;
	b = held;
}

// Exchanges two scalar endpoint positions in place.
void Swap(inout float a, inout float b) {
	const float held = a;
	a = b;
	b = held;
}
// Maps a texel's position along the endpoint axis to a 3-bit index (0-7).
// The scale/offset constants bias the rounding — presumably to match the
// BC6H weight table; verify against the reference encoder before changing.
uint ComputeIndex3(float texelPos, float endPoint0Pos, float endPoint1Pos) {
	float r = (texelPos - endPoint0Pos) / (endPoint1Pos - endPoint0Pos);
	return uint(clamp(r * 6.98182f + 0.00909f + 0.5f, 0.0f, 7.0f));
}

// Same as ComputeIndex3 but producing a 4-bit index (0-15).
uint ComputeIndex4(float texelPos, float endPoint0Pos, float endPoint1Pos) {
	float r = (texelPos - endPoint0Pos) / (endPoint1Pos - endPoint0Pos);
	return uint(clamp(r * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f));
}
// This adds a bitflag to quantized values that signifies whether they are negative.
void SignExtend(inout float3 v1, uint mask, uint signFlag) {
	// Work in integer space: keep only the magnitude bits selected by `mask`
	// and, for negative components, OR in `signFlag` (the sign bit of the
	// target fixed-width field). The result is written back as float.
	int3 v = int3(v1);
	v.x = (v.x & int(mask)) | (v.x < 0 ? int(signFlag) : 0);
	v.y = (v.y & int(mask)) | (v.y < 0 ? int(signFlag) : 0);
	v.z = (v.z & int(mask)) | (v.z < 0 ? int(signFlag) : 0);
	v1 = v;
}
// Encodes a block with mode 11 (2x 10-bit endpoints).
// On return `block` holds the candidate 128-bit encoding and `blockMSLE` its
// mean-squared-log error, so callers can compare it against other candidates.
void EncodeP1(inout uint4 block, inout float blockMSLE, float3 texels[16]) {
	// compute endpoints (min/max RGB bbox)
	float3 blockMin = texels[0];
	float3 blockMax = texels[0];
	for (uint i = 1u; i < 16u; ++i) {
		blockMin = min(blockMin, texels[i]);
		blockMax = max(blockMax, texels[i]);
	}
	// refine endpoints in log2 RGB space
	// (start from the opposite extreme and skip texels exactly equal to the
	// current min/max — i.e. find the second-smallest/second-largest texels)
	float3 refinedBlockMin = blockMax;
	float3 refinedBlockMax = blockMin;
	for (uint i = 0u; i < 16u; ++i) {
		refinedBlockMin = min(refinedBlockMin, texels[i] == blockMin ? refinedBlockMin : texels[i]);
		refinedBlockMax = max(refinedBlockMax, texels[i] == blockMax ? refinedBlockMax : texels[i]);
	}
	float3 logBlockMax = log2(blockMax + 1.0f);
	float3 logBlockMin = log2(blockMin + 1.0f);
	float3 logRefinedBlockMax = log2(refinedBlockMax + 1.0f);
	float3 logRefinedBlockMin = log2(refinedBlockMin + 1.0f);
	// Nudge each endpoint towards its refined value, by at most 1/32 of the
	// block's log-space extent.
	float3 logBlockMaxExt = (logBlockMax - logBlockMin) * (1.0f / 32.0f);
	logBlockMin += min(logRefinedBlockMin - logBlockMin, logBlockMaxExt);
	logBlockMax -= min(logBlockMax - logRefinedBlockMax, logBlockMaxExt);
	blockMin = exp2(logBlockMin) - 1.0f;
	blockMax = exp2(logBlockMax) - 1.0f;
	// Projection axis, normalized by the sum of its components (L1 norm).
	float3 blockDir = blockMax - blockMin;
	blockDir = blockDir / (blockDir.x + blockDir.y + blockDir.z);
	float3 endpoint0 = Quantize10(blockMin);
	float3 endpoint1 = Quantize10(blockMax);
	float endPoint0Pos = f32tof16(dot(blockMin, blockDir));
	float endPoint1Pos = f32tof16(dot(blockMax, blockDir));
#ifdef SIGNED
	// Signed 10-bit endpoints only have 9 magnitude bits.
	int maxVal10 = 0x1FF;
	endpoint0 = clamp(endpoint0, -maxVal10, maxVal10);
	endpoint1 = clamp(endpoint1, -maxVal10, maxVal10);
#endif
	// check if endpoint swap is required
	// (texel 0 is the anchor; its index MSB is not stored, so it must fall in
	// the lower half of the index range — NOTE(review): per BC6H convention,
	// confirm against the format spec)
	float fixupTexelPos = f32tof16(dot(texels[0], blockDir));
	uint fixupIndex = ComputeIndex4(fixupTexelPos, endPoint0Pos, endPoint1Pos);
	if (fixupIndex > 7) {
		Swap(endPoint0Pos, endPoint1Pos);
		Swap(endpoint0, endpoint1);
	}
	// compute indices
	uint indices[16] = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u };
	for (uint i = 0u; i < 16u; ++i) {
		float texelPos = f32tof16(dot(texels[i], blockDir));
		indices[i] = ComputeIndex4(texelPos, endPoint0Pos, endPoint1Pos);
	}
	// compute compression error (MSLE)
	float3 endpoint0Unq = Unquantize10(endpoint0);
	float3 endpoint1Unq = Unquantize10(endpoint1);
	float msle = 0.0f;
	for (uint i = 0u; i < 16u; ++i) {
		// Map the 4-bit index (0..15) to a 0..64 interpolation weight.
		float weight = floor((indices[i] * 64.0f) / 15.0f + 0.5f);
		float3 texelUnc = FinishUnquantize(endpoint0Unq, endpoint1Unq, weight);
		msle += CalcMSLE(texels[i], texelUnc);
	}
#ifdef SIGNED
	SignExtend(endpoint0, 0x1FF, 0x200);
	SignExtend(endpoint1, 0x1FF, 0x200);
#endif
	// encode block for mode 11
	blockMSLE = msle;
	block.x = 0x03; // mode bits
	// endpoints (10 bits per channel; .z channels straddle 32-bit word borders)
	block.x |= uint(endpoint0.x) << 5u;
	block.x |= uint(endpoint0.y) << 15u;
	block.x |= uint(endpoint0.z) << 25u;
	block.y |= uint(endpoint0.z) >> 7u;
	block.y |= uint(endpoint1.x) << 3u;
	block.y |= uint(endpoint1.y) << 13u;
	block.y |= uint(endpoint1.z) << 23u;
	block.z |= uint(endpoint1.z) >> 9u;
	// indices (4 bits each; texel 0 stores only 3 bits — its MSB is forced to
	// zero by the endpoint swap above)
	block.z |= indices[0] << 1u;
	block.z |= indices[1] << 4u;
	block.z |= indices[2] << 8u;
	block.z |= indices[3] << 12u;
	block.z |= indices[4] << 16u;
	block.z |= indices[5] << 20u;
	block.z |= indices[6] << 24u;
	block.z |= indices[7] << 28u;
	block.w |= indices[8] << 0u;
	block.w |= indices[9] << 4u;
	block.w |= indices[10] << 8u;
	block.w |= indices[11] << 12u;
	block.w |= indices[12] << 16u;
	block.w |= indices[13] << 20u;
	block.w |= indices[14] << 24u;
	block.w |= indices[15] << 28u;
}
// Squared distance from `Point` to the line through `PointOnLine` along the
// unit direction `LineDirection` (callers pass a normalized direction).
float DistToLineSq(float3 PointOnLine, float3 LineDirection, float3 Point) {
	float3 toPoint = Point - PointOnLine;
	float3 rejection = toPoint - LineDirection * dot(toPoint, LineDirection);
	return dot(rejection, rejection);
}
// Gets the deviation from the source data of a particular pattern (smaller is better).
// Splits the 16 texels into two partitions according to `pattern` and sums the
// squared distance of each texel from its partition's min->max diagonal line.
float EvaluateP2Pattern(uint pattern, float3 texels[16]) {
	// Per-partition RGB bounding boxes.
	float3 p0BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX);
	float3 p0BlockMax = float3(HALF_MIN, HALF_MIN, HALF_MIN);
	float3 p1BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX);
	float3 p1BlockMax = float3(HALF_MIN, HALF_MIN, HALF_MIN);
	for (uint i = 0; i < 16; ++i) {
		uint paletteID = Pattern(pattern, i);
		if (paletteID == 0) {
			p0BlockMin = min(p0BlockMin, texels[i]);
			p0BlockMax = max(p0BlockMax, texels[i]);
		} else {
			p1BlockMin = min(p1BlockMin, texels[i]);
			p1BlockMax = max(p1BlockMax, texels[i]);
		}
	}
	// NOTE(review): normalize() of a zero vector (a partition whose min equals
	// its max) is undefined in GLSL — presumably benign here since the score is
	// only used for relative ranking; confirm on target drivers.
	float3 p0BlockDir = normalize(p0BlockMax - p0BlockMin);
	float3 p1BlockDir = normalize(p1BlockMax - p1BlockMin);
	float sqDistanceFromLine = 0.0f;
	for (uint i = 0; i < 16; ++i) {
		uint paletteID = Pattern(pattern, i);
		if (paletteID == 0) {
			sqDistanceFromLine += DistToLineSq(p0BlockMin, p0BlockDir, texels[i]);
		} else {
			sqDistanceFromLine += DistToLineSq(p1BlockMin, p1BlockDir, texels[i]);
		}
	}
	return sqDistanceFromLine;
}
// Encodes a block with either mode 2 (7-bit base, 3x 6-bit delta), or mode 6 (9-bit base, 3x 5-bit delta). Both use pattern encoding.
// Tries both modes for the given 2-partition `pattern` and keeps whichever has
// the lower error; `block`/`blockMSLE` are only overwritten when the winner
// beats the current best candidate.
void EncodeP2Pattern(inout uint4 block, inout float blockMSLE, uint pattern, float3 texels[16]) {
	// Per-partition RGB bounding boxes.
	float3 p0BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX);
	float3 p0BlockMax = float3(HALF_MIN, HALF_MIN, HALF_MIN);
	float3 p1BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX);
	float3 p1BlockMax = float3(HALF_MIN, HALF_MIN, HALF_MIN);
	for (uint i = 0u; i < 16u; ++i) {
		uint paletteID = Pattern(pattern, i);
		if (paletteID == 0) {
			p0BlockMin = min(p0BlockMin, texels[i]);
			p0BlockMax = max(p0BlockMax, texels[i]);
		} else {
			p1BlockMin = min(p1BlockMin, texels[i]);
			p1BlockMax = max(p1BlockMax, texels[i]);
		}
	}
	// Projection axes, normalized by the sum of components (L1 norm).
	float3 p0BlockDir = p0BlockMax - p0BlockMin;
	float3 p1BlockDir = p1BlockMax - p1BlockMin;
	p0BlockDir = p0BlockDir / (p0BlockDir.x + p0BlockDir.y + p0BlockDir.z);
	p1BlockDir = p1BlockDir / (p1BlockDir.x + p1BlockDir.y + p1BlockDir.z);
	float p0Endpoint0Pos = f32tof16(dot(p0BlockMin, p0BlockDir));
	float p0Endpoint1Pos = f32tof16(dot(p0BlockMax, p0BlockDir));
	float p1Endpoint0Pos = f32tof16(dot(p1BlockMin, p1BlockDir));
	float p1Endpoint1Pos = f32tof16(dot(p1BlockMax, p1BlockDir));
	// Swap endpoints so each partition's anchor texel (texel 0 for partition 0,
	// PatternFixupID's texel for partition 1) keeps its index MSB at zero,
	// since that bit is not stored.
	uint fixupID = PatternFixupID(pattern);
	float p0FixupTexelPos = f32tof16(dot(texels[0], p0BlockDir));
	float p1FixupTexelPos = f32tof16(dot(texels[fixupID], p1BlockDir));
	uint p0FixupIndex = ComputeIndex3(p0FixupTexelPos, p0Endpoint0Pos, p0Endpoint1Pos);
	uint p1FixupIndex = ComputeIndex3(p1FixupTexelPos, p1Endpoint0Pos, p1Endpoint1Pos);
	if (p0FixupIndex > 3u) {
		Swap(p0Endpoint0Pos, p0Endpoint1Pos);
		Swap(p0BlockMin, p0BlockMax);
	}
	if (p1FixupIndex > 3u) {
		Swap(p1Endpoint0Pos, p1Endpoint1Pos);
		Swap(p1BlockMin, p1BlockMax);
	}
	// 3-bit palette index per texel, from whichever partition it belongs to.
	uint indices[16] = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u };
	for (uint i = 0u; i < 16u; ++i) {
		float p0TexelPos = f32tof16(dot(texels[i], p0BlockDir));
		float p1TexelPos = f32tof16(dot(texels[i], p1BlockDir));
		uint p0Index = ComputeIndex3(p0TexelPos, p0Endpoint0Pos, p0Endpoint1Pos);
		uint p1Index = ComputeIndex3(p1TexelPos, p1Endpoint0Pos, p1Endpoint1Pos);
		uint paletteID = Pattern(pattern, i);
		indices[i] = paletteID == 0u ? p0Index : p1Index;
	}
	// endpointXYn naming: X.Y = base/delta bit widths (7.6 = mode 2, 9.5 = mode 6),
	// n = endpoint number (0..3, two per partition).
	float3 endpoint760 = floor(Quantize7(p0BlockMin));
	float3 endpoint761 = floor(Quantize7(p0BlockMax));
	float3 endpoint762 = floor(Quantize7(p1BlockMin));
	float3 endpoint763 = floor(Quantize7(p1BlockMax));
	float3 endpoint950 = floor(Quantize9(p0BlockMin));
	float3 endpoint951 = floor(Quantize9(p0BlockMax));
	float3 endpoint952 = floor(Quantize9(p1BlockMin));
	float3 endpoint953 = floor(Quantize9(p1BlockMax));
	// Endpoints 1..3 are stored as deltas from endpoint 0.
	endpoint761 = endpoint761 - endpoint760;
	endpoint762 = endpoint762 - endpoint760;
	endpoint763 = endpoint763 - endpoint760;
	endpoint951 = endpoint951 - endpoint950;
	endpoint952 = endpoint952 - endpoint950;
	endpoint953 = endpoint953 - endpoint950;
	// Clamp deltas to their signed storage widths (6-bit and 5-bit).
	int maxVal76 = 0x1F;
	endpoint761 = clamp(endpoint761, -maxVal76, maxVal76);
	endpoint762 = clamp(endpoint762, -maxVal76, maxVal76);
	endpoint763 = clamp(endpoint763, -maxVal76, maxVal76);
	int maxVal95 = 0xF;
	endpoint951 = clamp(endpoint951, -maxVal95, maxVal95);
	endpoint952 = clamp(endpoint952, -maxVal95, maxVal95);
	endpoint953 = clamp(endpoint953, -maxVal95, maxVal95);
#ifdef SIGNED
	// Signed bases lose one magnitude bit.
	int maxVal7 = 0x3F;
	int maxVal9 = 0xFF;
	endpoint760 = clamp(endpoint760, -maxVal7, maxVal7);
	endpoint950 = clamp(endpoint950, -maxVal9, maxVal9);
#endif
	// Reconstruct both mode candidates and measure their errors.
	float3 endpoint760Unq = Unquantize7(endpoint760);
	float3 endpoint761Unq = Unquantize7(endpoint760 + endpoint761);
	float3 endpoint762Unq = Unquantize7(endpoint760 + endpoint762);
	float3 endpoint763Unq = Unquantize7(endpoint760 + endpoint763);
	float3 endpoint950Unq = Unquantize9(endpoint950);
	float3 endpoint951Unq = Unquantize9(endpoint950 + endpoint951);
	float3 endpoint952Unq = Unquantize9(endpoint950 + endpoint952);
	float3 endpoint953Unq = Unquantize9(endpoint950 + endpoint953);
	float msle76 = 0.0f;
	float msle95 = 0.0f;
	for (uint i = 0u; i < 16u; ++i) {
		uint paletteID = Pattern(pattern, i);
		float3 tmp760Unq = paletteID == 0u ? endpoint760Unq : endpoint762Unq;
		float3 tmp761Unq = paletteID == 0u ? endpoint761Unq : endpoint763Unq;
		float3 tmp950Unq = paletteID == 0u ? endpoint950Unq : endpoint952Unq;
		float3 tmp951Unq = paletteID == 0u ? endpoint951Unq : endpoint953Unq;
		// Map the 3-bit index (0..7) to a 0..64 interpolation weight.
		float weight = floor((indices[i] * 64.0f) / 7.0f + 0.5f);
		float3 texelUnc76 = FinishUnquantize(tmp760Unq, tmp761Unq, weight);
		float3 texelUnc95 = FinishUnquantize(tmp950Unq, tmp951Unq, weight);
		msle76 += CalcMSLE(texels[i], texelUnc76);
		msle95 += CalcMSLE(texels[i], texelUnc95);
	}
	// Tag negative deltas (and, if SIGNED, bases) with their sign bit before packing.
	SignExtend(endpoint761, 0x1F, 0x20);
	SignExtend(endpoint762, 0x1F, 0x20);
	SignExtend(endpoint763, 0x1F, 0x20);
	SignExtend(endpoint951, 0xF, 0x10);
	SignExtend(endpoint952, 0xF, 0x10);
	SignExtend(endpoint953, 0xF, 0x10);
#ifdef SIGNED
	SignExtend(endpoint760, 0x3F, 0x40);
	SignExtend(endpoint950, 0xFF, 0x100);
#endif
	// encode block
	float p2MSLE = min(msle76, msle95);
	if (p2MSLE < blockMSLE) {
		blockMSLE = p2MSLE;
		block = uint4(0u, 0u, 0u, 0u);
		if (p2MSLE == msle76) {
			// 7.6
			block.x = 0x1u; // mode bits
			// Endpoint bit-packing follows the mode's scattered layout; the
			// masked terms place individual delta bits into their slots.
			block.x |= (uint(endpoint762.y) & 0x20u) >> 3u;
			block.x |= (uint(endpoint763.y) & 0x10u) >> 1u;
			block.x |= (uint(endpoint763.y) & 0x20u) >> 1u;
			block.x |= uint(endpoint760.x) << 5u;
			block.x |= (uint(endpoint763.z) & 0x01u) << 12u;
			block.x |= (uint(endpoint763.z) & 0x02u) << 12u;
			block.x |= (uint(endpoint762.z) & 0x10u) << 10u;
			block.x |= uint(endpoint760.y) << 15u;
			block.x |= (uint(endpoint762.z) & 0x20u) << 17u;
			block.x |= (uint(endpoint763.z) & 0x04u) << 21u;
			block.x |= (uint(endpoint762.y) & 0x10u) << 20u;
			block.x |= uint(endpoint760.z) << 25u;
			block.y |= (uint(endpoint763.z) & 0x08u) >> 3u;
			block.y |= (uint(endpoint763.z) & 0x20u) >> 4u;
			block.y |= (uint(endpoint763.z) & 0x10u) >> 2u;
			block.y |= uint(endpoint761.x) << 3u;
			block.y |= (uint(endpoint762.y) & 0x0Fu) << 9u;
			block.y |= uint(endpoint761.y) << 13u;
			block.y |= (uint(endpoint763.y) & 0x0Fu) << 19u;
			block.y |= uint(endpoint761.z) << 23u;
			block.y |= (uint(endpoint762.z) & 0x07u) << 29u;
			block.z |= (uint(endpoint762.z) & 0x08u) >> 3u;
			block.z |= uint(endpoint762.x) << 1u;
			block.z |= uint(endpoint763.x) << 7u;
		} else {
			// 9.5
			block.x = 0xEu; // mode bits
			block.x |= uint(endpoint950.x) << 5u;
			block.x |= (uint(endpoint952.z) & 0x10u) << 10u;
			block.x |= uint(endpoint950.y) << 15u;
			block.x |= (uint(endpoint952.y) & 0x10u) << 20u;
			block.x |= uint(endpoint950.z) << 25u;
			block.y |= uint(endpoint950.z) >> 7u;
			block.y |= (uint(endpoint953.z) & 0x10u) >> 2u;
			block.y |= uint(endpoint951.x) << 3u;
			block.y |= (uint(endpoint953.y) & 0x10u) << 4u;
			block.y |= (uint(endpoint952.y) & 0x0Fu) << 9u;
			block.y |= uint(endpoint951.y) << 13u;
			block.y |= (uint(endpoint953.z) & 0x01u) << 18u;
			block.y |= (uint(endpoint953.y) & 0x0Fu) << 19u;
			block.y |= uint(endpoint951.z) << 23u;
			block.y |= (uint(endpoint953.z) & 0x02u) << 27u;
			block.y |= uint(endpoint952.z) << 29u;
			block.z |= (uint(endpoint952.z) & 0x08u) >> 3u;
			block.z |= uint(endpoint952.x) << 1u;
			block.z |= (uint(endpoint953.z) & 0x04u) << 4u;
			block.z |= uint(endpoint953.x) << 7u;
			block.z |= (uint(endpoint953.z) & 0x08u) << 9u;
		}
		block.z |= pattern << 13u;
		// Anchor texels (texel 0 and the partition-1 fixup texel) store only
		// 2 index bits, which shifts the packing of all following indices —
		// hence the three layout variants below.
		uint blockFixupID = PatternFixupID(pattern);
		if (blockFixupID == 15u) {
			block.z |= indices[0] << 18u;
			block.z |= indices[1] << 20u;
			block.z |= indices[2] << 23u;
			block.z |= indices[3] << 26u;
			block.z |= indices[4] << 29u;
			block.w |= indices[5] << 0u;
			block.w |= indices[6] << 3u;
			block.w |= indices[7] << 6u;
			block.w |= indices[8] << 9u;
			block.w |= indices[9] << 12u;
			block.w |= indices[10] << 15u;
			block.w |= indices[11] << 18u;
			block.w |= indices[12] << 21u;
			block.w |= indices[13] << 24u;
			block.w |= indices[14] << 27u;
			block.w |= indices[15] << 30u;
		} else if (blockFixupID == 2u) {
			block.z |= indices[0] << 18u;
			block.z |= indices[1] << 20u;
			block.z |= indices[2] << 23u;
			block.z |= indices[3] << 25u;
			block.z |= indices[4] << 28u;
			// indices[5] straddles the z/w word boundary.
			block.z |= indices[5] << 31u;
			block.w |= indices[5] >> 1u;
			block.w |= indices[6] << 2u;
			block.w |= indices[7] << 5u;
			block.w |= indices[8] << 8u;
			block.w |= indices[9] << 11u;
			block.w |= indices[10] << 14u;
			block.w |= indices[11] << 17u;
			block.w |= indices[12] << 20u;
			block.w |= indices[13] << 23u;
			block.w |= indices[14] << 26u;
			block.w |= indices[15] << 29u;
		} else {
			block.z |= indices[0] << 18u;
			block.z |= indices[1] << 20u;
			block.z |= indices[2] << 23u;
			block.z |= indices[3] << 26u;
			block.z |= indices[4] << 29u;
			block.w |= indices[5] << 0u;
			block.w |= indices[6] << 3u;
			block.w |= indices[7] << 6u;
			block.w |= indices[8] << 9u;
			block.w |= indices[9] << 11u;
			block.w |= indices[10] << 14u;
			block.w |= indices[11] << 17u;
			block.w |= indices[12] << 20u;
			block.w |= indices[13] << 23u;
			block.w |= indices[14] << 26u;
			block.w |= indices[15] << 29u;
		}
	}
}
layout(local_size_x = 8,
		local_size_y = 8,
		local_size_z = 1) in;
// One invocation compresses one 4x4 texel block and writes a single 128-bit
// BC6H block to the destination image.
void main() {
	// gather texels for current 4x4 block
	// 0 1 2 3
	// 4 5 6 7
	// 8 9 10 11
	// 12 13 14 15
	// UV is placed so four 2x2 gathers cover the block's four quadrants.
	float2 uv = gl_GlobalInvocationID.xy * params.p_textureSizeRcp * 4.0f + params.p_textureSizeRcp;
	float2 block0UV = uv;
	float2 block1UV = uv + float2(2.0f * params.p_textureSizeRcp.x, 0.0f);
	float2 block2UV = uv + float2(0.0f, 2.0f * params.p_textureSizeRcp.y);
	float2 block3UV = uv + float2(2.0f * params.p_textureSizeRcp.x, 2.0f * params.p_textureSizeRcp.y);
	float4 block0X = OGRE_GatherRed(srcTexture, pointSampler, block0UV);
	float4 block1X = OGRE_GatherRed(srcTexture, pointSampler, block1UV);
	float4 block2X = OGRE_GatherRed(srcTexture, pointSampler, block2UV);
	float4 block3X = OGRE_GatherRed(srcTexture, pointSampler, block3UV);
	float4 block0Y = OGRE_GatherGreen(srcTexture, pointSampler, block0UV);
	float4 block1Y = OGRE_GatherGreen(srcTexture, pointSampler, block1UV);
	float4 block2Y = OGRE_GatherGreen(srcTexture, pointSampler, block2UV);
	float4 block3Y = OGRE_GatherGreen(srcTexture, pointSampler, block3UV);
	float4 block0Z = OGRE_GatherBlue(srcTexture, pointSampler, block0UV);
	float4 block1Z = OGRE_GatherBlue(srcTexture, pointSampler, block1UV);
	float4 block2Z = OGRE_GatherBlue(srcTexture, pointSampler, block2UV);
	float4 block3Z = OGRE_GatherBlue(srcTexture, pointSampler, block3UV);
	// Unpack the gathers into row-major texel order: .w/.z are the first row
	// of each 2x2 quad and .x/.y the second (per textureGather component
	// ordering — assumes OGRE_Gather* map straight to textureGather; confirm
	// against the macro definitions).
	float3 texels[16];
	texels[0] = float3(block0X.w, block0Y.w, block0Z.w);
	texels[1] = float3(block0X.z, block0Y.z, block0Z.z);
	texels[2] = float3(block1X.w, block1Y.w, block1Z.w);
	texels[3] = float3(block1X.z, block1Y.z, block1Z.z);
	texels[4] = float3(block0X.x, block0Y.x, block0Z.x);
	texels[5] = float3(block0X.y, block0Y.y, block0Z.y);
	texels[6] = float3(block1X.x, block1Y.x, block1Z.x);
	texels[7] = float3(block1X.y, block1Y.y, block1Z.y);
	texels[8] = float3(block2X.w, block2Y.w, block2Z.w);
	texels[9] = float3(block2X.z, block2Y.z, block2Z.z);
	texels[10] = float3(block3X.w, block3Y.w, block3Z.w);
	texels[11] = float3(block3X.z, block3Y.z, block3Z.z);
	texels[12] = float3(block2X.x, block2Y.x, block2Z.x);
	texels[13] = float3(block2X.y, block2Y.y, block2Z.y);
	texels[14] = float3(block3X.x, block3Y.x, block3Z.x);
	texels[15] = float3(block3X.y, block3Y.y, block3Z.y);
	// Always try the single-partition mode 11 encoding first.
	uint4 block = uint4(0u, 0u, 0u, 0u);
	float blockMSLE = 0.0f;
	EncodeP1(block, blockMSLE, texels);
#ifdef QUALITY
	// In quality mode, also score every 2-partition pattern and try encoding
	// with the best-fitting one; EncodeP2Pattern keeps it only if it wins.
	float bestScore = EvaluateP2Pattern(0, texels);
	uint bestPattern = 0;
	for (uint i = 1u; i < PATTERN_NUM; ++i) {
		float score = EvaluateP2Pattern(i, texels);
		if (score < bestScore) {
			bestPattern = i;
			bestScore = score;
		}
	}
	EncodeP2Pattern(block, blockMSLE, bestPattern, texels);
#endif
	imageStore(dstTexture, int2(gl_GlobalInvocationID.xy), block);
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,6 @@
def can_build(env, platform):
    """Return whether this module should be built (editor builds only)."""
    build = env.editor_build
    return build
def configure(env):
    """No extra build configuration is required for this module."""
    return None

View file

@ -0,0 +1,742 @@
/**************************************************************************/
/* image_compress_betsy.cpp */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#include "image_compress_betsy.h"
#include "core/config/project_settings.h"
#include "betsy_bc1.h"
#include "alpha_stitch.glsl.gen.h"
#include "bc1.glsl.gen.h"
#include "bc4.glsl.gen.h"
#include "bc6h.glsl.gen.h"
#include "servers/display_server.h"
// Guards creation/teardown of the singleton compressor instance.
static Mutex betsy_mutex;
static BetsyCompressor *betsy = nullptr;

// Lookup tables below are indexed by BetsyFormat.

// Shader variant used to compress each format.
static const BetsyShaderType FORMAT_TO_TYPE[BETSY_FORMAT_MAX] = {
	BETSY_SHADER_BC1_STANDARD,
	BETSY_SHADER_BC1_DITHER,
	BETSY_SHADER_BC1_STANDARD,
	BETSY_SHADER_BC4_SIGNED,
	BETSY_SHADER_BC4_UNSIGNED,
	BETSY_SHADER_BC4_SIGNED,
	BETSY_SHADER_BC4_UNSIGNED,
	BETSY_SHADER_BC6_SIGNED,
	BETSY_SHADER_BC6_UNSIGNED,
};
// RD format of the shader's output image (one texel per compressed block:
// 64-bit blocks use RG32UI, 128-bit blocks use RGBA32UI).
static const RD::DataFormat BETSY_TO_RD_FORMAT[BETSY_FORMAT_MAX] = {
	RD::DATA_FORMAT_R32G32_UINT,
	RD::DATA_FORMAT_R32G32_UINT,
	RD::DATA_FORMAT_R32G32_UINT,
	RD::DATA_FORMAT_R32G32_UINT,
	RD::DATA_FORMAT_R32G32_UINT,
	RD::DATA_FORMAT_R32G32_UINT,
	RD::DATA_FORMAT_R32G32_UINT,
	RD::DATA_FORMAT_R32G32B32A32_UINT,
	RD::DATA_FORMAT_R32G32B32A32_UINT,
};
// Image::Format of the resulting compressed image.
static const Image::Format BETSY_TO_IMAGE_FORMAT[BETSY_FORMAT_MAX] = {
	Image::FORMAT_DXT1,
	Image::FORMAT_DXT1,
	Image::FORMAT_DXT5,
	Image::FORMAT_RGTC_R,
	Image::FORMAT_RGTC_R,
	Image::FORMAT_RGTC_RG,
	Image::FORMAT_RGTC_RG,
	Image::FORMAT_BPTC_RGBF,
	Image::FORMAT_BPTC_RGBFU,
};
// Creates the local RenderingDevice (plus a RenderingContextDriver when the
// RenderingServer cannot provide one), the point sampler, and all compute
// pipelines used for compression. Runs on the compressor's worker thread via
// the command queue; bails out silently when no rendering device can exist.
void BetsyCompressor::_init() {
	if (!DisplayServer::can_create_rendering_device()) {
		return;
	}
	// Create local RD.
	RenderingContextDriver *rcd = nullptr;
	RenderingDevice *rd = RenderingServer::get_singleton()->create_local_rendering_device();
	if (rd == nullptr) {
		// Headless/no-server path: create our own context driver + device.
#if defined(RD_ENABLED)
#if defined(METAL_ENABLED)
		rcd = memnew(RenderingContextDriverMetal);
		rd = memnew(RenderingDevice);
#endif
#if defined(VULKAN_ENABLED)
		if (rcd == nullptr) {
			rcd = memnew(RenderingContextDriverVulkan);
			rd = memnew(RenderingDevice);
		}
#endif
#endif
		if (rcd != nullptr && rd != nullptr) {
			Error err = rcd->initialize();
			if (err == OK) {
				err = rd->initialize(rcd);
			}
			// Roll back both allocations if either initialization failed.
			if (err != OK) {
				memdelete(rd);
				memdelete(rcd);
				rd = nullptr;
				rcd = nullptr;
			}
		}
	}
	ERR_FAIL_NULL_MSG(rd, "Unable to create a local RenderingDevice.");
	compress_rd = rd;
	compress_rcd = rcd;
	// Create the sampler state.
	// Nearest filtering + clamp-to-edge: the compression shaders gather exact
	// texels and must not filter or wrap.
	RD::SamplerState src_sampler_state;
	{
		src_sampler_state.repeat_u = RD::SAMPLER_REPEAT_MODE_CLAMP_TO_EDGE;
		src_sampler_state.repeat_v = RD::SAMPLER_REPEAT_MODE_CLAMP_TO_EDGE;
		src_sampler_state.mag_filter = RD::SAMPLER_FILTER_NEAREST;
		src_sampler_state.min_filter = RD::SAMPLER_FILTER_NEAREST;
		src_sampler_state.mip_filter = RD::SAMPLER_FILTER_NEAREST;
	}
	src_sampler = compress_rd->sampler_create(src_sampler_state);
	// Initialize RDShaderFiles.
	// Each block parses one GLSL source, compiles the variants in use, and
	// builds their compute pipelines into the shader cache.
	{
		Ref<RDShaderFile> bc1_shader;
		bc1_shader.instantiate();
		Error err = bc1_shader->parse_versions_from_text(bc1_shader_glsl);
		if (err != OK) {
			bc1_shader->print_errors("Betsy BC1 compress shader");
		}
		// Standard BC1 compression.
		cached_shaders[BETSY_SHADER_BC1_STANDARD].compiled = compress_rd->shader_create_from_spirv(bc1_shader->get_spirv_stages("standard"));
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC1_STANDARD].compiled.is_null());
		cached_shaders[BETSY_SHADER_BC1_STANDARD].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC1_STANDARD].compiled);
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC1_STANDARD].pipeline.is_null());
		// Dither BC1 variant. Unused, so comment out for now.
		//cached_shaders[BETSY_SHADER_BC1_DITHER].compiled = compress_rd->shader_create_from_spirv(bc1_shader->get_spirv_stages("dithered"));
		//ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC1_DITHER].compiled.is_null());
		//cached_shaders[BETSY_SHADER_BC1_DITHER].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC1_DITHER].compiled);
		//ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC1_DITHER].pipeline.is_null());
	}
	{
		Ref<RDShaderFile> bc4_shader;
		bc4_shader.instantiate();
		Error err = bc4_shader->parse_versions_from_text(bc4_shader_glsl);
		if (err != OK) {
			bc4_shader->print_errors("Betsy BC4 compress shader");
		}
		// Signed BC4 compression. Unused, so comment out for now.
		//cached_shaders[BETSY_SHADER_BC4_SIGNED].compiled = compress_rd->shader_create_from_spirv(bc4_shader->get_spirv_stages("signed"));
		//ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC4_SIGNED].compiled.is_null());
		//cached_shaders[BETSY_SHADER_BC4_SIGNED].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC4_SIGNED].compiled);
		//ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC4_SIGNED].pipeline.is_null());
		// Unsigned BC4 compression.
		cached_shaders[BETSY_SHADER_BC4_UNSIGNED].compiled = compress_rd->shader_create_from_spirv(bc4_shader->get_spirv_stages("unsigned"));
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC4_UNSIGNED].compiled.is_null());
		cached_shaders[BETSY_SHADER_BC4_UNSIGNED].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC4_UNSIGNED].compiled);
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC4_UNSIGNED].pipeline.is_null());
	}
	{
		Ref<RDShaderFile> bc6h_shader;
		bc6h_shader.instantiate();
		Error err = bc6h_shader->parse_versions_from_text(bc6h_shader_glsl);
		if (err != OK) {
			bc6h_shader->print_errors("Betsy BC6 compress shader");
		}
		// Signed BC6 compression.
		cached_shaders[BETSY_SHADER_BC6_SIGNED].compiled = compress_rd->shader_create_from_spirv(bc6h_shader->get_spirv_stages("signed"));
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC6_SIGNED].compiled.is_null());
		cached_shaders[BETSY_SHADER_BC6_SIGNED].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC6_SIGNED].compiled);
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC6_SIGNED].pipeline.is_null());
		// Unsigned BC6 compression.
		cached_shaders[BETSY_SHADER_BC6_UNSIGNED].compiled = compress_rd->shader_create_from_spirv(bc6h_shader->get_spirv_stages("unsigned"));
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC6_UNSIGNED].compiled.is_null());
		cached_shaders[BETSY_SHADER_BC6_UNSIGNED].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC6_UNSIGNED].compiled);
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC6_UNSIGNED].pipeline.is_null());
	}
	{
		Ref<RDShaderFile> alpha_stitch_shader;
		alpha_stitch_shader.instantiate();
		Error err = alpha_stitch_shader->parse_versions_from_text(alpha_stitch_shader_glsl);
		if (err != OK) {
			alpha_stitch_shader->print_errors("Betsy alpha stitch shader");
		}
		cached_shaders[BETSY_SHADER_ALPHA_STITCH].compiled = compress_rd->shader_create_from_spirv(alpha_stitch_shader->get_spirv_stages());
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_ALPHA_STITCH].compiled.is_null());
		cached_shaders[BETSY_SHADER_ALPHA_STITCH].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_ALPHA_STITCH].compiled);
		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_ALPHA_STITCH].pipeline.is_null());
	}
}
// Starts the worker task that pumps the command queue, then synchronously
// runs _init() on it so the RenderingDevice and pipelines exist before the
// first compression request.
void BetsyCompressor::init() {
	WorkerThreadPool::TaskID tid = WorkerThreadPool::get_singleton()->add_task(callable_mp(this, &BetsyCompressor::_thread_loop), true);
	command_queue.set_pump_task_id(tid);
	// _assign_mt_ids runs first on the worker so task_id is set before _init returns.
	command_queue.push(this, &BetsyCompressor::_assign_mt_ids, tid);
	command_queue.push_and_sync(this, &BetsyCompressor::_init);
	DEV_ASSERT(task_id == tid);
}
// Records the pump task's ID; executed on the worker thread via the command
// queue so `task_id` is set from the owning thread.
void BetsyCompressor::_assign_mt_ids(WorkerThreadPool::TaskID p_pump_task_id) {
	task_id = p_pump_task_id;
}
// Yield thread to WTP so other tasks can be done on it.
// Automatically regains control as soon as a task is pushed to the command queue.
// Loops until _thread_exit() sets `exit`, flushing queued commands each wake-up.
void BetsyCompressor::_thread_loop() {
	while (!exit) {
		WorkerThreadPool::get_singleton()->yield();
		command_queue.flush_all();
	}
}
// Runs on the worker thread: stops the pump loop and frees the GPU resources
// created by _init() (encoding table buffer, sampler, compiled shaders). The
// RenderingDevice itself is destroyed later in finish(), off this thread.
void BetsyCompressor::_thread_exit() {
	exit = true;
	if (compress_rd != nullptr) {
		if (dxt1_encoding_table_buffer.is_valid()) {
			compress_rd->free(dxt1_encoding_table_buffer);
		}
		compress_rd->free(src_sampler);
		// Clear the shader cache, pipelines will be unreferenced automatically.
		for (int i = 0; i < BETSY_SHADER_MAX; i++) {
			if (cached_shaders[i].compiled.is_valid()) {
				compress_rd->free(cached_shaders[i].compiled);
			}
		}
	}
}
// Shuts the compressor down: queues _thread_exit() so GPU resources are freed
// on the worker thread, waits for that task to complete, then destroys the
// RenderingDevice (and the context driver, when we created one ourselves).
void BetsyCompressor::finish() {
	command_queue.push(this, &BetsyCompressor::_thread_exit);
	if (task_id != WorkerThreadPool::INVALID_TASK_ID) {
		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
		task_id = WorkerThreadPool::INVALID_TASK_ID;
	}
	if (compress_rd != nullptr) {
		// Free the RD (and RCD if necessary).
		memdelete(compress_rd);
		compress_rd = nullptr;
		if (compress_rcd != nullptr) {
			memdelete(compress_rcd);
			compress_rcd = nullptr;
		}
	}
}
// Helper functions.
static int get_next_multiple(int n, int m) {
return n + (m - (n % m));
}
// Picks the RD::DataFormat matching r_img's pixel format, converting
// luminance and 3-channel images in place to their 4-channel equivalents
// first. Returns ERR_UNAVAILABLE for formats the compressor cannot read.
// Note: may mutate *r_img (format conversion) even on the success path.
static Error get_src_texture_format(Image *r_img, RD::DataFormat &r_format) {
	switch (r_img->get_format()) {
		case Image::FORMAT_L8:
			r_img->convert(Image::FORMAT_RGBA8);
			r_format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
			break;
		case Image::FORMAT_LA8:
			r_img->convert(Image::FORMAT_RGBA8);
			r_format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
			break;
		case Image::FORMAT_R8:
			r_format = RD::DATA_FORMAT_R8_UNORM;
			break;
		case Image::FORMAT_RG8:
			r_format = RD::DATA_FORMAT_R8G8_UNORM;
			break;
		case Image::FORMAT_RGB8:
			r_img->convert(Image::FORMAT_RGBA8);
			r_format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
			break;
		case Image::FORMAT_RGBA8:
			r_format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
			break;
		case Image::FORMAT_RH:
			r_format = RD::DATA_FORMAT_R16_SFLOAT;
			break;
		case Image::FORMAT_RGH:
			r_format = RD::DATA_FORMAT_R16G16_SFLOAT;
			break;
		case Image::FORMAT_RGBH:
			r_img->convert(Image::FORMAT_RGBAH);
			r_format = RD::DATA_FORMAT_R16G16B16A16_SFLOAT;
			break;
		case Image::FORMAT_RGBAH:
			r_format = RD::DATA_FORMAT_R16G16B16A16_SFLOAT;
			break;
		case Image::FORMAT_RF:
			r_format = RD::DATA_FORMAT_R32_SFLOAT;
			break;
		case Image::FORMAT_RGF:
			r_format = RD::DATA_FORMAT_R32G32_SFLOAT;
			break;
		case Image::FORMAT_RGBF:
			r_img->convert(Image::FORMAT_RGBAF);
			r_format = RD::DATA_FORMAT_R32G32B32A32_SFLOAT;
			break;
		case Image::FORMAT_RGBAF:
			r_format = RD::DATA_FORMAT_R32G32B32A32_SFLOAT;
			break;
		case Image::FORMAT_RGBE9995:
			r_format = RD::DATA_FORMAT_E5B9G9R9_UFLOAT_PACK32;
			break;
		default: {
			// Compressed and other unsupported formats cannot be sourced.
			return ERR_UNAVAILABLE;
		}
	}
	return OK;
}
Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) {
uint64_t start_time = OS::get_singleton()->get_ticks_msec();
// Return an error so that the compression can fall back to cpu compression
if (compress_rd == nullptr) {
return ERR_CANT_CREATE;
}
if (r_img->is_compressed()) {
return ERR_INVALID_DATA;
}
Error err = OK;
// Destination format.
Image::Format dest_format = BETSY_TO_IMAGE_FORMAT[p_format];
RD::DataFormat dst_rd_format = BETSY_TO_RD_FORMAT[p_format];
BetsyShaderType shader_type = FORMAT_TO_TYPE[p_format];
BetsyShader shader = cached_shaders[shader_type];
BetsyShader secondary_shader; // The secondary shader is used for alpha blocks. For BC it's BC4U and for ETC it's ETC2_RU (8-bit variant).
BetsyShader stitch_shader;
bool needs_alpha_block = false;
switch (p_format) {
case BETSY_FORMAT_BC3:
case BETSY_FORMAT_BC5_UNSIGNED:
needs_alpha_block = true;
secondary_shader = cached_shaders[BETSY_SHADER_BC4_UNSIGNED];
stitch_shader = cached_shaders[BETSY_SHADER_ALPHA_STITCH];
break;
default:
break;
}
// src_texture format information.
RD::TextureFormat src_texture_format;
{
src_texture_format.array_layers = 1;
src_texture_format.depth = 1;
src_texture_format.mipmaps = 1;
src_texture_format.texture_type = RD::TEXTURE_TYPE_2D;
src_texture_format.usage_bits = RD::TEXTURE_USAGE_SAMPLING_BIT | RD::TEXTURE_USAGE_CAN_UPDATE_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT;
}
err = get_src_texture_format(r_img, src_texture_format.format);
if (err != OK) {
return err;
}
// For the destination format just copy the source format and change the usage bits.
RD::TextureFormat dst_texture_format = src_texture_format;
dst_texture_format.usage_bits = RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT | RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_CAN_COPY_FROM_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT | RD::TEXTURE_USAGE_CAN_UPDATE_BIT;
dst_texture_format.format = dst_rd_format;
RD::TextureFormat dst_texture_format_alpha;
RD::TextureFormat dst_texture_format_combined;
if (needs_alpha_block) {
dst_texture_format_combined = dst_texture_format;
dst_texture_format_combined.format = RD::DATA_FORMAT_R32G32B32A32_UINT;
dst_texture_format.usage_bits |= RD::TEXTURE_USAGE_SAMPLING_BIT;
dst_texture_format_alpha = dst_texture_format;
dst_texture_format_alpha.format = RD::DATA_FORMAT_R32G32_UINT;
}
// Encoding table setup.
if ((dest_format == Image::FORMAT_DXT1 || dest_format == Image::FORMAT_DXT5) && dxt1_encoding_table_buffer.is_null()) {
Vector<uint8_t> data;
data.resize(1024 * 4);
memcpy(data.ptrw(), dxt1_encoding_table, 1024 * 4);
dxt1_encoding_table_buffer = compress_rd->storage_buffer_create(1024 * 4, data);
}
const int mip_count = r_img->get_mipmap_count() + 1;
// Container for the compressed data.
Vector<uint8_t> dst_data;
dst_data.resize(Image::get_image_data_size(r_img->get_width(), r_img->get_height(), dest_format, r_img->has_mipmaps()));
uint8_t *dst_data_ptr = dst_data.ptrw();
Vector<Vector<uint8_t>> src_images;
src_images.push_back(Vector<uint8_t>());
Vector<uint8_t> *src_image_ptr = src_images.ptrw();
// Compress each mipmap.
for (int i = 0; i < mip_count; i++) {
int64_t ofs, size;
int width, height;
r_img->get_mipmap_offset_size_and_dimensions(i, ofs, size, width, height);
// Set the source texture width and size.
src_texture_format.height = height;
src_texture_format.width = width;
// Set the destination texture width and size.
dst_texture_format.height = (height + 3) >> 2;
dst_texture_format.width = (width + 3) >> 2;
// Create a buffer filled with the source mip layer data.
src_image_ptr[0].resize(size);
memcpy(src_image_ptr[0].ptrw(), r_img->ptr() + ofs, size);
// Create the textures on the GPU.
RID src_texture = compress_rd->texture_create(src_texture_format, RD::TextureView(), src_images);
RID dst_texture_primary = compress_rd->texture_create(dst_texture_format, RD::TextureView());
{
Vector<RD::Uniform> uniforms;
{
{
RD::Uniform u;
u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
u.binding = 0;
u.append_id(src_sampler);
u.append_id(src_texture);
uniforms.push_back(u);
}
{
RD::Uniform u;
u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
u.binding = 1;
u.append_id(dst_texture_primary);
uniforms.push_back(u);
}
if (dest_format == Image::FORMAT_DXT1 || dest_format == Image::FORMAT_DXT5) {
RD::Uniform u;
u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
u.binding = 2;
u.append_id(dxt1_encoding_table_buffer);
uniforms.push_back(u);
}
}
RID uniform_set = compress_rd->uniform_set_create(uniforms, shader.compiled, 0);
RD::ComputeListID compute_list = compress_rd->compute_list_begin();
compress_rd->compute_list_bind_compute_pipeline(compute_list, shader.pipeline);
compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
switch (shader_type) {
case BETSY_SHADER_BC6_SIGNED:
case BETSY_SHADER_BC6_UNSIGNED: {
BC6PushConstant push_constant;
push_constant.sizeX = 1.0f / width;
push_constant.sizeY = 1.0f / height;
compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant));
compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
} break;
case BETSY_SHADER_BC1_STANDARD: {
BC1PushConstant push_constant;
push_constant.num_refines = 2;
compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant));
compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
} break;
case BETSY_SHADER_BC4_UNSIGNED: {
BC4PushConstant push_constant;
push_constant.channel_idx = 0;
compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC4PushConstant));
compress_rd->compute_list_dispatch(compute_list, 1, get_next_multiple(width, 16) / 16, get_next_multiple(height, 16) / 16);
} break;
default: {
} break;
}
compress_rd->compute_list_end();
if (!needs_alpha_block) {
compress_rd->submit();
compress_rd->sync();
}
}
RID dst_texture_rid = dst_texture_primary;
if (needs_alpha_block) {
// Set the destination texture width and size.
dst_texture_format_alpha.height = (height + 3) >> 2;
dst_texture_format_alpha.width = (width + 3) >> 2;
RID dst_texture_alpha = compress_rd->texture_create(dst_texture_format_alpha, RD::TextureView());
{
Vector<RD::Uniform> uniforms;
{
{
RD::Uniform u;
u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
u.binding = 0;
u.append_id(src_sampler);
u.append_id(src_texture);
uniforms.push_back(u);
}
{
RD::Uniform u;
u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
u.binding = 1;
u.append_id(dst_texture_alpha);
uniforms.push_back(u);
}
}
RID uniform_set = compress_rd->uniform_set_create(uniforms, secondary_shader.compiled, 0);
RD::ComputeListID compute_list = compress_rd->compute_list_begin();
compress_rd->compute_list_bind_compute_pipeline(compute_list, secondary_shader.pipeline);
compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
BC4PushConstant push_constant;
push_constant.channel_idx = dest_format == Image::FORMAT_DXT5 ? 3 : 1;
compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC4PushConstant));
compress_rd->compute_list_dispatch(compute_list, 1, get_next_multiple(width, 16) / 16, get_next_multiple(height, 16) / 16);
compress_rd->compute_list_end();
}
// Stitching
// Set the destination texture width and size.
dst_texture_format_combined.height = (height + 3) >> 2;
dst_texture_format_combined.width = (width + 3) >> 2;
RID dst_texture_combined = compress_rd->texture_create(dst_texture_format_combined, RD::TextureView());
{
Vector<RD::Uniform> uniforms;
{
{
RD::Uniform u;
u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
u.binding = 0;
u.append_id(src_sampler);
u.append_id(dest_format == Image::FORMAT_DXT5 ? dst_texture_alpha : dst_texture_primary);
uniforms.push_back(u);
}
{
RD::Uniform u;
u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
u.binding = 1;
u.append_id(src_sampler);
u.append_id(dest_format == Image::FORMAT_DXT5 ? dst_texture_primary : dst_texture_alpha);
uniforms.push_back(u);
}
{
RD::Uniform u;
u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
u.binding = 2;
u.append_id(dst_texture_combined);
uniforms.push_back(u);
}
}
RID uniform_set = compress_rd->uniform_set_create(uniforms, stitch_shader.compiled, 0);
RD::ComputeListID compute_list = compress_rd->compute_list_begin();
compress_rd->compute_list_bind_compute_pipeline(compute_list, stitch_shader.pipeline);
compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
compress_rd->compute_list_end();
compress_rd->submit();
compress_rd->sync();
}
dst_texture_rid = dst_texture_combined;
compress_rd->free(dst_texture_primary);
compress_rd->free(dst_texture_alpha);
}
// Copy data from the GPU to the buffer.
const Vector<uint8_t> texture_data = compress_rd->texture_get_data(dst_texture_rid, 0);
int64_t dst_ofs = Image::get_image_mipmap_offset(r_img->get_width(), r_img->get_height(), dest_format, i);
memcpy(dst_data_ptr + dst_ofs, texture_data.ptr(), texture_data.size());
// Free the source and dest texture.
compress_rd->free(src_texture);
compress_rd->free(dst_texture_rid);
}
src_images.clear();
// Set the compressed data to the image.
r_img->set_data(r_img->get_width(), r_img->get_height(), r_img->has_mipmaps(), dest_format, dst_data);
print_verbose(
vformat("Betsy: Encoding a %dx%d image with %d mipmaps as %s took %d ms.",
r_img->get_width(),
r_img->get_height(),
r_img->get_mipmap_count(),
Image::get_format_name(dest_format),
OS::get_singleton()->get_ticks_msec() - start_time));
return OK;
}
// Lazily construct the shared Betsy compressor instance.
// Guarded by betsy_mutex so concurrent importer threads create it only once.
void ensure_betsy_exists() {
	betsy_mutex.lock();
	if (!betsy) {
		// First caller allocates and initializes the compressor.
		betsy = memnew(BetsyCompressor);
		betsy->init();
	}
	betsy_mutex.unlock();
}
// Compress an HDR image to BC6H on the GPU.
//
// Only floating-point / HDR source formats (FORMAT_RF .. FORMAT_RGBE9995) are
// eligible; anything else returns ERR_UNAVAILABLE untouched. The signed or
// unsigned BC6H variant is picked from the image's own data.
Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels) {
	ensure_betsy_exists();

	const Image::Format source_format = r_img->get_format();
	Error result = ERR_UNAVAILABLE;

	const bool is_hdr_format = source_format >= Image::FORMAT_RF && source_format <= Image::FORMAT_RGBE9995;
	if (is_hdr_format) {
		// Negative values require the signed BC6H encoder.
		const BetsyFormat target = r_img->detect_signed() ? BETSY_FORMAT_BC6_SIGNED : BETSY_FORMAT_BC6_UNSIGNED;
		result = betsy->compress(target, r_img);
	}

	// Release the GPU device unless the project opted into caching it.
	if (!GLOBAL_GET("rendering/textures/vram_compression/cache_gpu_compressor")) {
		free_device();
	}

	return result;
}
// Compress an image to an S3TC format (BC1/BC3/BC4/BC5) on the GPU, choosing
// the target format from which channels the image actually uses.
// Returns ERR_UNAVAILABLE when no suitable format exists for p_channels.
Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels) {
	ensure_betsy_exists();

	// Map used channels to a Betsy target; BETSY_FORMAT_MAX means "no match".
	BetsyFormat target_format = BETSY_FORMAT_MAX;
	switch (p_channels) {
		case Image::USED_CHANNELS_RGB:
		case Image::USED_CHANNELS_L:
			target_format = BETSY_FORMAT_BC1;
			break;

		case Image::USED_CHANNELS_RGBA:
		case Image::USED_CHANNELS_LA:
			target_format = BETSY_FORMAT_BC3;
			break;

		case Image::USED_CHANNELS_R:
			target_format = BETSY_FORMAT_BC4_UNSIGNED;
			break;

		case Image::USED_CHANNELS_RG:
			target_format = BETSY_FORMAT_BC5_UNSIGNED;
			break;

		default:
			break;
	}

	const Error result = target_format == BETSY_FORMAT_MAX ? ERR_UNAVAILABLE : betsy->compress(target_format, r_img);

	// Release the GPU device unless the project opted into caching it.
	if (!GLOBAL_GET("rendering/textures/vram_compression/cache_gpu_compressor")) {
		free_device();
	}

	return result;
}
void free_device() {
if (betsy != nullptr) {
betsy->finish();
memdelete(betsy);
}
}

View file

@ -0,0 +1,132 @@
/**************************************************************************/
/* image_compress_betsy.h */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#ifndef IMAGE_COMPRESS_BETSY_H
#define IMAGE_COMPRESS_BETSY_H
#include "core/io/image.h"
#include "core/object/worker_thread_pool.h"
#include "core/os/thread.h"
#include "core/templates/command_queue_mt.h"
#include "servers/rendering/rendering_device_binds.h"
#include "servers/rendering/rendering_server_default.h"
#if defined(VULKAN_ENABLED)
#include "drivers/vulkan/rendering_context_driver_vulkan.h"
#endif
#if defined(METAL_ENABLED)
#include "drivers/metal/rendering_context_driver_metal.h"
#endif
// Block-compression output formats the Betsy compressor can produce.
// BC1 = DXT1, BC3 = DXT5; BC4/BC5 are single/dual-channel; BC6H is HDR.
enum BetsyFormat {
	BETSY_FORMAT_BC1,
	BETSY_FORMAT_BC1_DITHER, // BC1 with dithering applied during encode.
	BETSY_FORMAT_BC3,
	BETSY_FORMAT_BC4_SIGNED,
	BETSY_FORMAT_BC4_UNSIGNED,
	BETSY_FORMAT_BC5_SIGNED,
	BETSY_FORMAT_BC5_UNSIGNED,
	BETSY_FORMAT_BC6_SIGNED,
	BETSY_FORMAT_BC6_UNSIGNED,
	BETSY_FORMAT_MAX, // Sentinel; not a real format.
};
// Compute-shader variants used to implement the formats above. Also indexes
// the compressor's shader cache, so BETSY_SHADER_MAX doubles as the array size.
enum BetsyShaderType {
	BETSY_SHADER_BC1_STANDARD,
	BETSY_SHADER_BC1_DITHER,
	BETSY_SHADER_BC4_SIGNED,
	BETSY_SHADER_BC4_UNSIGNED,
	BETSY_SHADER_BC6_SIGNED,
	BETSY_SHADER_BC6_UNSIGNED,
	BETSY_SHADER_ALPHA_STITCH, // Combines separate color + alpha blocks into one texture.
	BETSY_SHADER_MAX,
};
// Push constants for the BC6H encoder.
struct BC6PushConstant {
	float sizeX; // Reciprocal of the source mip width (1.0 / width).
	float sizeY; // Reciprocal of the source mip height (1.0 / height).
	uint32_t padding[2] = { 0 }; // Pads to 16 bytes — presumably to match the shader's push-constant layout; verify against the GLSL.
};
// Push constants for the BC1 encoder.
struct BC1PushConstant {
	uint32_t num_refines; // Refinement pass count; the encoder sets this to 2.
	uint32_t padding[3] = { 0 }; // Pads to 16 bytes — presumably to match the shader's push-constant layout; verify against the GLSL.
};
// Push constants for the BC4 encoder.
struct BC4PushConstant {
	uint32_t channel_idx; // Source channel to encode: 0 = red; the alpha pass uses 3 for DXT5 and 1 otherwise.
	uint32_t padding[3] = { 0 }; // Pads to 16 bytes — presumably to match the shader's push-constant layout; verify against the GLSL.
};
// Destroys the shared BetsyCompressor instance and its rendering device.
void free_device();

// Compression entry points hooked into Image's function pointers by
// initialize_betsy_module() in register_types.cpp.
Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels);
Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels);
// Persistent GPU texture compressor running the Betsy compute shaders on its
// own RenderingDevice. Public calls are marshalled through a command queue to
// a worker-thread pump, so callers block until their request completes.
class BetsyCompressor : public Object {
	mutable CommandQueueMT command_queue;
	bool exit = false; // Signals the pump loop to stop.
	WorkerThreadPool::TaskID task_id = WorkerThreadPool::INVALID_TASK_ID;

	// A compiled compute shader together with its ready-to-bind pipeline.
	struct BetsyShader {
		RID compiled;
		RID pipeline;
	};

	// Resources shared by all compression formats.
	RenderingDevice *compress_rd = nullptr;
	RenderingContextDriver *compress_rcd = nullptr;
	BetsyShader cached_shaders[BETSY_SHADER_MAX]; // One slot per shader type; presumably populated by _get_shader().
	RID src_sampler;

	// Format-specific resources.
	RID dxt1_encoding_table_buffer; // Storage buffer holding the encoding table used by the DXT1/DXT5 color pass.

	void _init();
	void _assign_mt_ids(WorkerThreadPool::TaskID p_pump_task_id);
	void _thread_loop();
	void _thread_exit();

	Error _get_shader(BetsyFormat p_format, const String &p_version, BetsyShader &r_shader);
	Error _compress(BetsyFormat p_format, Image *r_img);

public:
	void init();
	void finish();

	// Blocking entry point: forwards the request to the worker thread via the
	// command queue and returns the result of _compress().
	Error compress(BetsyFormat p_format, Image *r_img) {
		Error err;
		command_queue.push_and_ret(this, &BetsyCompressor::_compress, &err, p_format, r_img);
		return err;
	}
};
#endif // IMAGE_COMPRESS_BETSY_H

View file

@ -0,0 +1,50 @@
/**************************************************************************/
/* register_types.cpp */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#include "register_types.h"
#include "image_compress_betsy.h"
// Install the Betsy GPU compression callbacks on Image.
// Runs only at the scene initialization level; other levels are no-ops.
void initialize_betsy_module(ModuleInitializationLevel p_level) {
	if (p_level == MODULE_INITIALIZATION_LEVEL_SCENE) {
		Image::_image_compress_bptc_rd_func = _betsy_compress_bptc;
		Image::_image_compress_bc_rd_func = _betsy_compress_s3tc;
	}
}
// Release the Betsy compressor and its GPU device at shutdown.
// Runs only at the scene initialization level; other levels are no-ops.
void uninitialize_betsy_module(ModuleInitializationLevel p_level) {
	if (p_level == MODULE_INITIALIZATION_LEVEL_SCENE) {
		free_device();
	}
}

View file

@ -0,0 +1,39 @@
/**************************************************************************/
/* register_types.h */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#ifndef BETSY_REGISTER_TYPES_H
#define BETSY_REGISTER_TYPES_H
#include "modules/register_module_types.h"
// Registers the Betsy GPU compression callbacks with Image (scene level only).
void initialize_betsy_module(ModuleInitializationLevel p_level);
// Frees the Betsy compressor and its device at shutdown (scene level only).
void uninitialize_betsy_module(ModuleInitializationLevel p_level);
#endif // BETSY_REGISTER_TYPES_H