Merge pull request #116542 from Ivorforce/char-range-cpp

Move char ranges from `char_range.inc` to `char_range.cpp` (non-constexpr)
2026-02-21 12:17:02 -06:00 · 2026-02-21 12:17:02 -06:00 · a3e84cc2af
commit a3e84cc2af
parent 9a19d32fdb 9dfc2c977b
4 changed files with 50 additions and 43 deletions
--- a/core/string/char_range.cpp
+++ b/core/string/char_range.cpp
@@ -1,5 +1,5 @@
 /**************************************************************************/
-/*  char_range.inc                                                        */
+/*  char_range.cpp                                                        */
 /**************************************************************************/
 /*                         This file is part of:                          */
 /*                             GODOT ENGINE                               */
@@ -30,19 +30,13 @@

 // This file was generated using the `misc/scripts/char_range_fetch.py` script.

-#pragma once
-
-#include "core/typedefs.h"
+#include "core/string/char_utils.h"

 // Unicode Derived Core Properties
 // Source: https://www.unicode.org/Public/17.0.0/ucd/DerivedCoreProperties.txt

-struct CharRange {
-	char32_t start;
-	char32_t end;
-};
-
-constexpr inline CharRange xid_start[] = {
+const int xid_start_size = 692;
+const CharRange xid_start[xid_start_size] = {
 	{ 0x41, 0x5a },
 	{ 0x5f, 0x5f },
 	{ 0x61, 0x7a },
@@ -737,7 +731,8 @@ constexpr inline CharRange xid_start[] = {
 	{ 0x31350, 0x33479 },
 };

-constexpr inline CharRange xid_continue[] = {
+const int xid_continue_size = 806;
+const CharRange xid_continue[xid_continue_size] = {
 	{ 0x30, 0x39 },
 	{ 0x41, 0x5a },
 	{ 0x5f, 0x5f },
@@ -1546,7 +1541,8 @@ constexpr inline CharRange xid_continue[] = {
 	{ 0xe0100, 0xe01ef },
 };

-constexpr inline CharRange uppercase_letter[] = {
+const int uppercase_letter_size = 660;
+const CharRange uppercase_letter[uppercase_letter_size] = {
 	{ 0x41, 0x5a },
 	{ 0xc0, 0xd6 },
 	{ 0xd8, 0xde },
@@ -2209,7 +2205,8 @@ constexpr inline CharRange uppercase_letter[] = {
 	{ 0x1f170, 0x1f189 },
 };

-constexpr inline CharRange lowercase_letter[] = {
+const int lowercase_letter_size = 677;
+const CharRange lowercase_letter[lowercase_letter_size] = {
 	{ 0x61, 0x7a },
 	{ 0xaa, 0xaa },
 	{ 0xb5, 0xb5 },
@@ -2889,7 +2886,8 @@ constexpr inline CharRange lowercase_letter[] = {
 	{ 0x1e922, 0x1e943 },
 };

-constexpr inline CharRange unicode_letter[] = {
+const int unicode_letter_size = 761;
+const CharRange unicode_letter[unicode_letter_size] = {
 	{ 0x41, 0x5a },
 	{ 0x61, 0x7a },
 	{ 0xaa, 0xaa },
--- a/core/string/char_utils.h
+++ b/core/string/char_utils.h
@@ -32,14 +32,28 @@

 #include "core/typedefs.h"

-#include "char_range.inc"
-
 static constexpr char hex_char_table_upper[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
 static constexpr char hex_char_table_lower[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };

-#define BSEARCH_CHAR_RANGE(m_array) \
+struct CharRange {
+	char32_t start;
+	char32_t end;
+};
+
+extern const CharRange xid_start[];
+extern const int xid_start_size;
+extern const CharRange xid_continue[];
+extern const int xid_continue_size;
+extern const CharRange uppercase_letter[];
+extern const int uppercase_letter_size;
+extern const CharRange lowercase_letter[];
+extern const int lowercase_letter_size;
+extern const CharRange unicode_letter[];
+extern const int unicode_letter_size;
+
+#define BSEARCH_CHAR_RANGE(m_array, m_size) \
 	int low = 0; \
-	int high = std_size(m_array) - 1; \
+	int high = m_size - 1; \
 	int middle = (low + high) / 2; \
 \
 	while (low <= high) { \
@@ -56,24 +70,24 @@ static constexpr char hex_char_table_lower[16] = { '0', '1', '2', '3', '4', '5',
 \
 	return false

-constexpr bool is_unicode_identifier_start(char32_t p_char) {
-	BSEARCH_CHAR_RANGE(xid_start);
+inline bool is_unicode_identifier_start(char32_t p_char) {
+	BSEARCH_CHAR_RANGE(xid_start, xid_start_size);
 }

-constexpr bool is_unicode_identifier_continue(char32_t p_char) {
-	BSEARCH_CHAR_RANGE(xid_continue);
+inline bool is_unicode_identifier_continue(char32_t p_char) {
+	BSEARCH_CHAR_RANGE(xid_continue, xid_continue_size);
 }

-constexpr bool is_unicode_upper_case(char32_t p_char) {
-	BSEARCH_CHAR_RANGE(uppercase_letter);
+inline bool is_unicode_upper_case(char32_t p_char) {
+	BSEARCH_CHAR_RANGE(uppercase_letter, uppercase_letter_size);
 }

-constexpr bool is_unicode_lower_case(char32_t p_char) {
-	BSEARCH_CHAR_RANGE(lowercase_letter);
+inline bool is_unicode_lower_case(char32_t p_char) {
+	BSEARCH_CHAR_RANGE(lowercase_letter, lowercase_letter_size);
 }

-constexpr bool is_unicode_letter(char32_t p_char) {
-	BSEARCH_CHAR_RANGE(unicode_letter);
+inline bool is_unicode_letter(char32_t p_char) {
+	BSEARCH_CHAR_RANGE(unicode_letter, unicode_letter_size);
 }

 #undef BSEARCH_CHAR_RANGE
--- a/misc/scripts/char_range_fetch.py
+++ b/misc/scripts/char_range_fetch.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3

 # Script used to dump char ranges for specific properties from
-# the Unicode Character Database to the `char_range.inc` file.
+# the Unicode Character Database to the `char_range.cpp` file.
 # NOTE: This script is deliberately not integrated into the build system;
 # you should run it manually whenever you want to update the data.
 from __future__ import annotations
@@ -89,7 +89,8 @@ def parse_unicode_data() -> None:


 def make_array(array_name: str, range_list: list[tuple[int, int]]) -> str:
-    result: str = f"\n\nconstexpr inline CharRange {array_name}[] = {{\n"
+    result: str = f"\n\nconst int {array_name}_size = {len(range_list)};\n"
+    result += f"const CharRange {array_name}[{array_name}_size] = {{\n"

    for start, end in range_list:
        result += f"\t{{ 0x{start:x}, 0x{end:x} }},\n"
@@ -102,22 +103,16 @@ def make_array(array_name: str, range_list: list[tuple[int, int]]) -> str:
 def generate_char_range_inc() -> None:
    parse_unicode_data()

-    source: str = generate_copyright_header("char_range.inc")
+    source: str = generate_copyright_header("char_range.cpp")

    source += f"""
 // This file was generated using the `misc/scripts/char_range_fetch.py` script.

-#pragma once
-
-#include "core/typedefs.h"
+#include "core/string/char_utils.h"

 // Unicode Derived Core Properties
-// Source: {URL}
-
-struct CharRange {{
-\tchar32_t start;
-\tchar32_t end;
-}};"""
+// Source: {URL}\
+"""

    source += make_array("xid_start", xid_start)
    source += make_array("xid_continue", xid_continue)
@@ -127,11 +122,11 @@ struct CharRange {{

    source += "\n"

-    char_range_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/char_range.inc")
+    char_range_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/char_range.cpp")
    with open(char_range_path, "w", newline="\n") as f:
        f.write(source)

-    print("`char_range.inc` generated successfully.")
+    print("`char_range.cpp` generated successfully.")


 if __name__ == "__main__":
--- a/misc/scripts/unicode_ranges_fetch.py
+++ b/misc/scripts/unicode_ranges_fetch.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3

 # Script used to dump char ranges from
-# the Unicode Character Database to the `char_range.inc` file.
+# the Unicode Character Database to the `unicode_ranges.inc` file.
 # NOTE: This script is deliberately not integrated into the build system;
 # you should run it manually whenever you want to update the data.
 from __future__ import annotations