Module:Unicode data: Difference between revisions

From Zoophilia Wiki
Jump to navigation | Jump to search
meta>Erutuon
(fix Hangul Syllable function)
m (1 revision imported)
 
(57 intermediate revisions by 8 users not shown)
Line 1: Line 1:
local export = {}
local p = {}


local floor = math.floor
local floor = math.floor


-- The following leads, vowels, and trails come from here:
-- http://www.unicode.org/Public/UNIDATA/Jamo.txt

-- Raise an error whose message is produced by string.format.
--
-- Two calling conventions are accepted:
--   errorf(level, fmt, ...)  -- "level" is a number; it is passed to error()
--                            -- as level + 1 so that it is counted from the
--                            -- caller of errorf rather than errorf itself.
--   errorf(fmt, ...)         -- no level given; error() is called with
--                            -- level 2.
-- This function never returns normally.
local function errorf(level, ...)
	if type(level) ~= "number" then
		-- The first argument is not a level, so it is the format string.
		return error(string.format(level, ...), 2)
	end
	return error(string.format(...), level + 1)
end
 
-- Binary search over a list of ranges.
--
-- codepoint: the number to look up.
-- ranges:    a list whose entries hold a lower bound at index 1 and an upper
--            bound (inclusive) at index 2. The number of entries is read
--            from ranges.length when present; otherwise it is obtained from
--            the "length" function of Module:TableTools.
--            Binary search only works if the entries are sorted by their
--            bounds and do not overlap; this function does not check that.
--
-- Returns the matching entry and its index, or nil plus the last index that
-- was examined (nil as well when the list is empty).
local function binary_range_search(codepoint, ranges)
	local lo = 1
	local hi = ranges.length or require "Module:TableTools".length(ranges)
	local probe
	while lo <= hi do
		probe = floor((lo + hi) / 2)
		local candidate = ranges[probe]
		local first, last = candidate[1], candidate[2]
		if codepoint < first then
			-- Target lies below this entry: keep the lower half.
			hi = probe - 1
		elseif codepoint <= last then
			return candidate, probe
		else
			-- Target lies above this entry: keep the upper half.
			lo = probe + 1
		end
	end
	return nil, probe
end
p.binary_range_search = binary_range_search
 
--[[
local function linear_range_search(codepoint, ranges)
for i, range in ipairs(ranges) do
if range[1] <= codepoint and codepoint <= range[2] then
return range
end
end
end
--]]
 
-- Lazy loader for the data submodules of this module.
-- Indexing "loader" with a name loads "Module:Unicode data/<name>" through
-- mw.loadData; for instance, loader.blocks yields the data of
-- [[Module:Unicode data/blocks]]. The outcome is stored in the loader table
-- under that name, so each submodule is requested at most once. If loading
-- raises an error, false is stored and returned instead.
local loader = {}
setmetatable(loader, {
	__index = function (cache, name)
		local ok, loaded = pcall(mw.loadData, "Module:Unicode data/" .. name)
		local result = false
		if ok then
			result = loaded
		end
		-- Remember failures too, so a missing module is not retried.
		cache[name] = result
		return result
	end
})


-- For the algorithm used to generate Hangul Syllable names,
-- For the algorithm used to generate Hangul Syllable names,
Line 10: Line 58:
-- Unicode Specification:
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
-- Jamo short names used to compose Hangul syllable names.
-- Source: http://www.unicode.org/Public/UNIDATA/Jamo.txt
-- Each table is indexed from 0 so that the indices computed from a
-- syllable's offset (codepoint - 0xAC00) can be used directly.

-- Leading consonants. The empty string is the lead that contributes nothing
-- to the syllable name.
local hangul_leads = {
	[0] = "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS",
	"", "J", "JJ", "C", "K", "T", "P", "H"
}
-- "#" only counts indices 1..n, so add 1 for the entry stored at index 0.
hangul_leads.length = #hangul_leads + 1

-- Vowels.
local hangul_vowels = {
	[0] = "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
	"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
	"I"
}
-- The counts below are declared local so that they do not leak into the
-- global environment; the code further down the file still sees them.
local hangul_vowel_count = #hangul_vowels + 1

-- Trailing consonants. Index 0 is the empty string: no trailing consonant.
local hangul_trails = {
	[0] = "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB",
	"LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K",
	"T", "P", "H"
}
local hangul_trail_count = #hangul_trails + 1

-- Number of vowel/trail combinations, i.e. how many syllables share one lead.
local hangul_codas = hangul_vowel_count * hangul_trail_count
local name_hooks = {
local name_hooks = {
{    0x00,    0x1F, "<control-%04X>" }, -- C0 control characters
{    0x00,    0x1F, "<control-%04X>" }, -- C0 control characters
{    0x7F,    0x9F, "<control-%04X>" }, -- DEL and C1 control characters
{    0x7F,    0x9F, "<control-%04X>" }, -- DEL and C1 control characters
{  0x3400,  0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
{  0x3400,  0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
{  0x4E00,  0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph --change v10
{  0x4E00,  0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
{  0xAC00,  0xD7A3, function (codepoint)
{  0xAC00,  0xD7A3, function (codepoint) -- Hangul Syllables
local Hangul_data = loader.Hangul
local syllable_index = codepoint - 0xAC00
local syllable_index = codepoint - 0xAC00


return ("HANGUL SYLLABLE %s%s%s"):format(
return ("HANGUL SYLLABLE %s%s%s"):format(
hangul_leads[floor(syllable_index / hangul_codas)],
Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
hangul_vowels[floor((syllable_index % hangul_codas) / hangul_trail_count)],
Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
hangul_trails[syllable_index % hangul_trail_count]
/ Hangul_data.trail_count)],
Hangul_data.trails[syllable_index % Hangul_data.trail_count]
)
)
end },
end },
{  0xD800,  0xDB7F, "<surrogate-%04X>" }, -- Non Private Use High Surrogate
-- High Surrogates, High Private Use Surrogates, Low Surrogates
{  0xDB80,   0xDBFF, "<surrogate-%04X>" }, -- Private Use High Surrogate
0xD800,  0xDFFF, "<surrogate-%04X>" },
0xDC00,  0xDFFF, "<surrogate-%04X>" }, -- Low Surrogate
{  0xE000,  0xF8FF, "<private-use-%04X>" }, -- Private Use
{  0xE000,  0xF8FF, "<private-use-%04X>" }, -- Private Use
{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%05X" }, -- Tangut
-- CJK Compatibility Ideographs
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%05X" }, -- Nushu --add v10
{  0xF900,  0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension B
{  0xFA70,  0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension C
{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
0x2A740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension D
{  0x18800,  0x18AFF, function (codepoint)
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension E
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension F --add v10
end },
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%05X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
{  0xF0000,  0xFFFFD, "<private-use-%05X>" }, -- Plane 15 Private Use
{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
{ 0x100000, 0x10FFFD, "<private-use-%06X>" }  -- Plane 16 Private Use
{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
end},
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
}
name_hooks.length = #name_hooks


local name_range_cache
local name_range_cache
Line 71: Line 113:
end
end
end
end
--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
if codepoint < 0 or 0x10FFFF < codepoint then
errorf("Codepoint %04X out of range", codepoint)
end
end
--]]


-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function export.lookup_name(codepoint)
function p.lookup_name(codepoint)
-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are noncharacters:
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- (Cn) and specifically noncharacters:
-- https://www.unicode.org/faq/private_use.html#nonchar4
-- https://www.unicode.org/faq/private_use.html#nonchar4
if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
or math.floor(codepoint % 0x10000) >= 0xFFFE) then
or floor(codepoint % 0x10000) >= 0xFFFE) then
return ("<noncharacter-%04X>"):format(codepoint)
return ("<noncharacter-%04X>"):format(codepoint)
end
end


if name_range_cache then
if name_range_cache -- Check if previously used "name hook" applies to this code point.
if (codepoint >= name_range_cache[1]) and (codepoint <= name_range_cache[2]) then
and codepoint >= name_range_cache[1]
return generate_name(name_range_cache[3], codepoint)
and codepoint <= name_range_cache[2] then
end
return generate_name(name_range_cache[3], codepoint)
end
end
 
for _, item in ipairs(name_hooks) do
local range = binary_range_search(codepoint, name_hooks)
if codepoint < item[1] then
if range then
break
name_range_cache = range
elseif codepoint <= item[2] then
return generate_name(range[3], codepoint)
name_range_cache = item
return generate_name(item[3], codepoint)
end
end
end


local success, data = pcall(mw.loadData,
local data = loader[('names/%03X'):format(codepoint / 0x1000)]
('Module:Unicode data/names/%03X'):format(codepoint / 0x1000))
if success and data[codepoint] then
if data and data[codepoint] then
return data[codepoint]
return data[codepoint]
-- Unassigned (Cn) includes noncharacters and reserved characters.
-- Unassigned (Cn) consists of noncharacters and reserved characters.
-- The character is not a noncharacter and if it were assigned, its name
-- The character has been established not to be a noncharacter,
-- would already have been retrieved, so it must be reserved.
-- and if it were assigned, its name would already have been retrieved,
-- so it must be reserved.
else
else
return ("<reserved-%04X>"):format(codepoint)
return ("<reserved-%04X>"):format(codepoint)
Line 110: Line 162:
end
end


function export.lookup_image(codepoint)
function p.lookup_image(codepoint)
local success, data = pcall(mw.loadData,
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
('Module:Unicode data/images/%03X'):format(codepoint / 0x1000)
)
if success then
if data then
return data[codepoint]
return data[codepoint]
end
end
end
-- Template entry point: look up the name of a code point and make it safe
-- to show in wikitext.
--
-- frame: the frame object of the invocation. The code point is read from the
--        first positional argument of the frame or, if that is absent, from
--        the first positional argument of the parent frame. It is converted
--        with tonumber.
--
-- Returns the name given by export.lookup_name with every "<" replaced by
-- "&lt;". Raises an error if the argument is missing or is not a number.
function export.template_lookup_name(frame)
	local codepoint = tonumber(frame.args[1] or frame:getParent().args[1])
	if not codepoint then
		-- Fail with a readable message instead of a comparison error deep
		-- inside lookup_name.
		error("template_lookup_name: the first parameter must be a code point number")
	end
	local name = export.lookup_name(codepoint)
	-- gsub returns the new string AND the number of replacements. The extra
	-- parentheses keep only the string, so the count is not returned to the
	-- caller (which would otherwise append it to the output).
	return (name:gsub("<", "&lt;"))
end
end


Line 130: Line 174:
[ 1] = "Supplementary Multilingual Plane";
[ 1] = "Supplementary Multilingual Plane";
[ 2] = "Supplementary Ideographic Plane";
[ 2] = "Supplementary Ideographic Plane";
[13] = "Supplementary Special-purpose Plane";
[ 3] = "Tertiary Ideographic Plane";
[14] = "Supplementary Private Use Area-A";
[14] = "Supplementary Special-purpose Plane";
[15] = "Supplementary Private Use Area-B";
[15] = "Supplementary Private Use Area-A";
[16] = "Supplementary Private Use Area-B";
}
}


-- http://www.unicode.org/Public/UNIDATA/Blocks.txt
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
-- This should be kept synchronized with [[Module:category tree/scriptcatboiler/blocks]].
local blocks
local blocks = {
 
{ "Basic Latin",                                    0x000000, 0x00007F },
local function block_iter(blocks, i)
{ "Latin-1 Supplement",                              0x000080, 0x0000FF },
i = i + 1
{ "Latin Extended-A",                                0x000100, 0x00017F },
local data = blocks[i]
{ "Latin Extended-B",                                0x000180, 0x00024F },
if data then
{ "IPA Extensions",                                  0x000250, 0x0002AF },
-- Unpack doesn't work on tables loaded with mw.loadData.
{ "Spacing Modifier Letters",                        0x0002B0, 0x0002FF },
return i, data[1], data[2], data[3]
{ "Combining Diacritical Marks",                    0x000300, 0x00036F },
end
{ "Greek and Coptic",                                0x000370, 0x0003FF },
end
{ "Cyrillic",                                        0x000400, 0x0004FF },
{ "Cyrillic Supplement",                            0x000500, 0x00052F },
{ "Armenian",                                        0x000530, 0x00058F },
{ "Hebrew",                                          0x000590, 0x0005FF },
{ "Arabic",                                          0x000600, 0x0006FF },
{ "Syriac",                                          0x000700, 0x00074F },
{ "Arabic Supplement",                              0x000750, 0x00077F },
{ "Thaana",                                          0x000780, 0x0007BF },
{ "NKo",                                            0x0007C0, 0x0007FF },
{ "Samaritan",                                      0x000800, 0x00083F },
{ "Mandaic",                                        0x000840, 0x00085F },
{ "Syriac Supplement",                              0x000860, 0x00086F },
{ "Arabic Extended-A",                              0x0008A0, 0x0008FF },
{ "Devanagari",                                      0x000900, 0x00097F },
{ "Bengali",                                        0x000980, 0x0009FF },
{ "Gurmukhi",                                        0x000A00, 0x000A7F },
{ "Gujarati",                                        0x000A80, 0x000AFF },
{ "Oriya",                                          0x000B00, 0x000B7F },
{ "Tamil",                                          0x000B80, 0x000BFF },
{ "Telugu",                                          0x000C00, 0x000C7F },
{ "Kannada",                                        0x000C80, 0x000CFF },
{ "Malayalam",                                      0x000D00, 0x000D7F },
{ "Sinhala",                                        0x000D80, 0x000DFF },
{ "Thai",                                            0x000E00, 0x000E7F },
{ "Lao",                                            0x000E80, 0x000EFF },
{ "Tibetan",                                        0x000F00, 0x000FFF },
{ "Myanmar",                                        0x001000, 0x00109F },
{ "Georgian",                                        0x0010A0, 0x0010FF },
{ "Hangul Jamo",                                    0x001100, 0x0011FF },
{ "Ethiopic",                                        0x001200, 0x00137F },
{ "Ethiopic Supplement",                            0x001380, 0x00139F },
{ "Cherokee",                                        0x0013A0, 0x0013FF },
{ "Unified Canadian Aboriginal Syllabics",          0x001400, 0x00167F },
{ "Ogham",                                          0x001680, 0x00169F },
{ "Runic",                                          0x0016A0, 0x0016FF },
{ "Tagalog",                                        0x001700, 0x00171F },
{ "Hanunoo",                                        0x001720, 0x00173F },
{ "Buhid",                                          0x001740, 0x00175F },
{ "Tagbanwa",                                        0x001760, 0x00177F },
{ "Khmer",                                          0x001780, 0x0017FF },
{ "Mongolian",                                      0x001800, 0x0018AF },
{ "Unified Canadian Aboriginal Syllabics Extended",  0x0018B0, 0x0018FF },
{ "Limbu",                                          0x001900, 0x00194F },
{ "Tai Le",                                          0x001950, 0x00197F },
{ "New Tai Lue",                                    0x001980, 0x0019DF },
{ "Khmer Symbols",                                  0x0019E0, 0x0019FF },
{ "Buginese",                                        0x001A00, 0x001A1F },
{ "Tai Tham",                                        0x001A20, 0x001AAF },
{ "Combining Diacritical Marks Extended",            0x001AB0, 0x001AFF },
{ "Balinese",                                        0x001B00, 0x001B7F },
{ "Sundanese",                                      0x001B80, 0x001BBF },
{ "Batak",                                          0x001BC0, 0x001BFF },
{ "Lepcha",                                          0x001C00, 0x001C4F },
{ "Ol Chiki",                                        0x001C50, 0x001C7F },
{ "Cyrillic Extended-C",                            0x001C80, 0x001C8F },
{ "Georgian Extended",                               0x001C90, 0x001CBF },
{ "Sundanese Supplement",                            0x001CC0, 0x001CCF },
{ "Vedic Extensions",                                0x001CD0, 0x001CFF },
{ "Phonetic Extensions",                            0x001D00, 0x001D7F },
{ "Phonetic Extensions Supplement",                  0x001D80, 0x001DBF },
{ "Combining Diacritical Marks Supplement",          0x001DC0, 0x001DFF },
{ "Latin Extended Additional",                      0x001E00, 0x001EFF },
{ "Greek Extended",                                  0x001F00, 0x001FFF },
{ "General Punctuation",                            0x002000, 0x00206F },
{ "Superscripts and Subscripts",                    0x002070, 0x00209F },
{ "Currency Symbols",                                0x0020A0, 0x0020CF },
{ "Combining Diacritical Marks for Symbols",        0x0020D0, 0x0020FF },
{ "Letterlike Symbols",                              0x002100, 0x00214F },
{ "Number Forms",                                    0x002150, 0x00218F },
{ "Arrows",                                          0x002190, 0x0021FF },
{ "Mathematical Operators",                          0x002200, 0x0022FF },
{ "Miscellaneous Technical",                        0x002300, 0x0023FF },
{ "Control Pictures",                                0x002400, 0x00243F },
{ "Optical Character Recognition",                  0x002440, 0x00245F },
{ "Enclosed Alphanumerics",                          0x002460, 0x0024FF },
{ "Box Drawing",                                    0x002500, 0x00257F },
{ "Block Elements",                                  0x002580, 0x00259F },
{ "Geometric Shapes",                                0x0025A0, 0x0025FF },
{ "Miscellaneous Symbols",                          0x002600, 0x0026FF },
{ "Dingbats",                                        0x002700, 0x0027BF },
{ "Miscellaneous Mathematical Symbols-A",            0x0027C0, 0x0027EF },
{ "Supplemental Arrows-A",                          0x0027F0, 0x0027FF },
{ "Braille Patterns",                                0x002800, 0x0028FF },
{ "Supplemental Arrows-B",                          0x002900, 0x00297F },
{ "Miscellaneous Mathematical Symbols-B",            0x002980, 0x0029FF },
{ "Supplemental Mathematical Operators",            0x002A00, 0x002AFF },
{ "Miscellaneous Symbols and Arrows",                0x002B00, 0x002BFF },
{ "Glagolitic",                                      0x002C00, 0x002C5F },
{ "Latin Extended-C",                                0x002C60, 0x002C7F },
{ "Coptic",                                          0x002C80, 0x002CFF },
{ "Georgian Supplement",                            0x002D00, 0x002D2F },
{ "Tifinagh",                                        0x002D30, 0x002D7F },
{ "Ethiopic Extended",                              0x002D80, 0x002DDF },
{ "Cyrillic Extended-A",                            0x002DE0, 0x002DFF },
{ "Supplemental Punctuation",                        0x002E00, 0x002E7F },
{ "CJK Radicals Supplement",                        0x002E80, 0x002EFF },
{ "Kangxi Radicals",                                0x002F00, 0x002FDF },
{ "Ideographic Description Characters",              0x002FF0, 0x002FFF },
{ "CJK Symbols and Punctuation",                    0x003000, 0x00303F },
{ "Hiragana",                                        0x003040, 0x00309F },
{ "Katakana",                                        0x0030A0, 0x0030FF },
{ "Bopomofo",                                        0x003100, 0x00312F },
{ "Hangul Compatibility Jamo",                      0x003130, 0x00318F },
{ "Kanbun",                                          0x003190, 0x00319F },
{ "Bopomofo Extended",                              0x0031A0, 0x0031BF },
{ "CJK Strokes",                                    0x0031C0, 0x0031EF },
{ "Katakana Phonetic Extensions",                    0x0031F0, 0x0031FF },
{ "Enclosed CJK Letters and Months",                0x003200, 0x0032FF },
{ "CJK Compatibility",                              0x003300, 0x0033FF },
{ "CJK Unified Ideographs Extension A",              0x003400, 0x004DBF },
{ "Yijing Hexagram Symbols",                        0x004DC0, 0x004DFF },
{ "CJK Unified Ideographs",                          0x004E00, 0x009FFF },
{ "Yi Syllables",                                    0x00A000, 0x00A48F },
{ "Yi Radicals",                                    0x00A490, 0x00A4CF },
{ "Lisu",                                            0x00A4D0, 0x00A4FF },
{ "Vai",                                            0x00A500, 0x00A63F },
{ "Cyrillic Extended-B",                            0x00A640, 0x00A69F },
{ "Bamum",                                          0x00A6A0, 0x00A6FF },
{ "Modifier Tone Letters",                          0x00A700, 0x00A71F },
{ "Latin Extended-D",                                0x00A720, 0x00A7FF },
{ "Syloti Nagri",                                    0x00A800, 0x00A82F },
{ "Common Indic Number Forms",                      0x00A830, 0x00A83F },
{ "Phags-pa",                                        0x00A840, 0x00A87F },
{ "Saurashtra",                                      0x00A880, 0x00A8DF },
{ "Devanagari Extended",                            0x00A8E0, 0x00A8FF },
{ "Kayah Li",                                        0x00A900, 0x00A92F },
{ "Rejang",                                          0x00A930, 0x00A95F },
{ "Hangul Jamo Extended-A",                          0x00A960, 0x00A97F },
{ "Javanese",                                        0x00A980, 0x00A9DF },
{ "Myanmar Extended-B",                              0x00A9E0, 0x00A9FF },
{ "Cham",                                            0x00AA00, 0x00AA5F },
{ "Myanmar Extended-A",                              0x00AA60, 0x00AA7F },
{ "Tai Viet",                                        0x00AA80, 0x00AADF },
{ "Meetei Mayek Extensions",                        0x00AAE0, 0x00AAFF },
{ "Ethiopic Extended-A",                            0x00AB00, 0x00AB2F },
{ "Latin Extended-E",                                0x00AB30, 0x00AB6F },
{ "Cherokee Supplement",                            0x00AB70, 0x00ABBF },
{ "Meetei Mayek",                                    0x00ABC0, 0x00ABFF },
{ "Hangul Syllables",                                0x00AC00, 0x00D7AF },
{ "Hangul Jamo Extended-B",                          0x00D7B0, 0x00D7FF },
{ "High Surrogates",                                0x00D800, 0x00DB7F },
{ "High Private Use Surrogates",                    0x00DB80, 0x00DBFF },
{ "Low Surrogates",                                  0x00DC00, 0x00DFFF },
{ "Private Use Area",                                0x00E000, 0x00F8FF },
{ "CJK Compatibility Ideographs",                    0x00F900, 0x00FAFF },
{ "Alphabetic Presentation Forms",                  0x00FB00, 0x00FB4F },
{ "Arabic Presentation Forms-A",                    0x00FB50, 0x00FDFF },
{ "Variation Selectors",                            0x00FE00, 0x00FE0F },
{ "Vertical Forms",                                  0x00FE10, 0x00FE1F },
{ "Combining Half Marks",                            0x00FE20, 0x00FE2F },
{ "CJK Compatibility Forms",                        0x00FE30, 0x00FE4F },
{ "Small Form Variants",                            0x00FE50, 0x00FE6F },
{ "Arabic Presentation Forms-B",                    0x00FE70, 0x00FEFF },
{ "Halfwidth and Fullwidth Forms",                  0x00FF00, 0x00FFEF },
{ "Specials",                                        0x00FFF0, 0x00FFFF },
{ "Linear B Syllabary",                              0x010000, 0x01007F },
{ "Linear B Ideograms",                              0x010080, 0x0100FF },
{ "Aegean Numbers",                                  0x010100, 0x01013F },
{ "Ancient Greek Numbers",                          0x010140, 0x01018F },
{ "Ancient Symbols",                                0x010190, 0x0101CF },
{ "Phaistos Disc",                                  0x0101D0, 0x0101FF },
{ "Lycian",                                          0x010280, 0x01029F },
{ "Carian",                                          0x0102A0, 0x0102DF },
{ "Coptic Epact Numbers",                            0x0102E0, 0x0102FF },
{ "Old Italic",                                      0x010300, 0x01032F },
{ "Gothic",                                          0x010330, 0x01034F },
{ "Old Permic",                                      0x010350, 0x01037F },
{ "Ugaritic",                                        0x010380, 0x01039F },
{ "Old Persian",                                    0x0103A0, 0x0103DF },
{ "Deseret",                                        0x010400, 0x01044F },
{ "Shavian",                                        0x010450, 0x01047F },
{ "Osmanya",                                        0x010480, 0x0104AF },
{ "Osage",                                          0x0104B0, 0x0104FF },
{ "Elbasan",                                        0x010500, 0x01052F },
{ "Caucasian Albanian",                              0x010530, 0x01056F },
{ "Linear A",                                        0x010600, 0x01077F },
{ "Cypriot Syllabary",                              0x010800, 0x01083F },
{ "Imperial Aramaic",                                0x010840, 0x01085F },
{ "Palmyrene",                                      0x010860, 0x01087F },
{ "Nabataean",                                      0x010880, 0x0108AF },
{ "Hatran",                                          0x0108E0, 0x0108FF },
{ "Phoenician",                                      0x010900, 0x01091F },
{ "Lydian",                                          0x010920, 0x01093F },
{ "Meroitic Hieroglyphs",                            0x010980, 0x01099F },
{ "Meroitic Cursive",                                0x0109A0, 0x0109FF },
{ "Kharoshthi",                                      0x010A00, 0x010A5F },
{ "Old South Arabian",                              0x010A60, 0x010A7F },
{ "Old North Arabian",                              0x010A80, 0x010A9F },
{ "Manichaean",                                      0x010AC0, 0x010AFF },
{ "Avestan",                                        0x010B00, 0x010B3F },
{ "Inscriptional Parthian",                          0x010B40, 0x010B5F },
{ "Inscriptional Pahlavi",                          0x010B60, 0x010B7F },
{ "Psalter Pahlavi",                                0x010B80, 0x010BAF },
{ "Old Turkic",                                      0x010C00, 0x010C4F },
{ "Old Hungarian",                                  0x010C80, 0x010CFF },
{ "Hanifi Rohingya",                                0x010D00, 0x010D3F },
{ "Rumi Numeral Symbols",                            0x010E60, 0x010E7F },
{ "Old Sogdian",                                    0x010F00, 0x010F2F },
{ "Sogdian",                                        0x010F30, 0x010F6F },
{ "Brahmi",                                          0x011000, 0x01107F },
{ "Kaithi",                                          0x011080, 0x0110CF },
{ "Sora Sompeng",                                    0x0110D0, 0x0110FF },
{ "Chakma",                                          0x011100, 0x01114F },
{ "Mahajani",                                        0x011150, 0x01117F },
{ "Sharada",                                        0x011180, 0x0111DF },
{ "Sinhala Archaic Numbers",                        0x0111E0, 0x0111FF },
{ "Khojki",                                          0x011200, 0x01124F },
{ "Multani",                                        0x011280, 0x0112AF },
{ "Khudawadi",                                      0x0112B0, 0x0112FF },
{ "Grantha",                                        0x011300, 0x01137F },
{ "Newa",                                            0x011400, 0x01147F },
{ "Tirhuta",                                        0x011480, 0x0114DF },
{ "Siddham",                                        0x011580, 0x0115FF },
{ "Modi",                                            0x011600, 0x01165F },
{ "Mongolian Supplement",                            0x011660, 0x01167F },
{ "Takri",                                          0x011680, 0x0116CF },
{ "Ahom",                                            0x011700, 0x01173F },
{ "Dogra",                                          0x011800, 0x01184F },
{ "Warang Citi",                                    0x0118A0, 0x0118FF },
{ "Zanabazar Square",                                0x011A00, 0x011A4F },
{ "Soyombo",                                        0x011A50, 0x011AAF },
{ "Pau Cin Hau",                                    0x011AC0, 0x011AFF },
{ "Bhaiksuki",                                      0x011C00, 0x011C6F },
{ "Marchen",                                        0x011C70, 0x011CBF },
{ "Masaram Gondi",                                  0x011D00, 0x011D5F },
{ "Gunjala Gondi",                                  0x011D60, 0x011DAF },
{ "Makasar",                                        0x011EE0, 0x011EFF },
{ "Cuneiform",                                      0x012000, 0x0123FF },
{ "Cuneiform Numbers and Punctuation",              0x012400, 0x01247F },
{ "Early Dynastic Cuneiform",                        0x012480, 0x01254F },
{ "Egyptian Hieroglyphs",                            0x013000, 0x01342F },
{ "Anatolian Hieroglyphs",                          0x014400, 0x01467F },
{ "Bamum Supplement",                                0x016800, 0x016A3F },
{ "Mro",                                            0x016A40, 0x016A6F },
{ "Bassa Vah",                                      0x016AD0, 0x016AFF },
{ "Pahawh Hmong",                                    0x016B00, 0x016B8F },
{ "Medefaidrin",                                    0x016E40, 0x016E9F },
{ "Miao",                                            0x016F00, 0x016F9F },
{ "Ideographic Symbols and Punctuation",            0x016FE0, 0x016FFF },
{ "Tangut",                                          0x017000, 0x0187FF },
{ "Tangut Components",                              0x018800, 0x018AFF },
{ "Kana Supplement",                                0x01B000, 0x01B0FF },
{ "Kana Extended-A",                                0x01B100, 0x01B12F },
{ "Nushu",                                          0x01B170, 0x01B2FF },
{ "Duployan",                                        0x01BC00, 0x01BC9F },
{ "Shorthand Format Controls",                      0x01BCA0, 0x01BCAF },
{ "Byzantine Musical Symbols",                      0x01D000, 0x01D0FF },
{ "Musical Symbols",                                0x01D100, 0x01D1FF },
{ "Ancient Greek Musical Notation",                  0x01D200, 0x01D24F },
{ "Mayan Numerals",                                  0x01D2E0, 0x01D2FF },
{ "Tai Xuan Jing Symbols",                          0x01D300, 0x01D35F },
{ "Counting Rod Numerals",                          0x01D360, 0x01D37F },
{ "Mathematical Alphanumeric Symbols",              0x01D400, 0x01D7FF },
{ "Sutton SignWriting",                              0x01D800, 0x01DAAF },
{ "Glagolitic Supplement",                          0x01E000, 0x01E02F },
{ "Mende Kikakui",                                  0x01E800, 0x01E8DF },
{ "Adlam",                                          0x01E900, 0x01E95F },
{ "Indic Siyaq Numbers",                            0x01EC70, 0x01ECBF },
{ "Arabic Mathematical Alphabetic Symbols",          0x01EE00, 0x01EEFF },
{ "Mahjong Tiles",                                  0x01F000, 0x01F02F },
{ "Domino Tiles",                                    0x01F030, 0x01F09F },
{ "Playing Cards",                                  0x01F0A0, 0x01F0FF },
{ "Enclosed Alphanumeric Supplement",                0x01F100, 0x01F1FF },
{ "Enclosed Ideographic Supplement",                0x01F200, 0x01F2FF },
{ "Miscellaneous Symbols and Pictographs",          0x01F300, 0x01F5FF },
{ "Emoticons",                                      0x01F600, 0x01F64F },
{ "Ornamental Dingbats",                            0x01F650, 0x01F67F },
{ "Transport and Map Symbols",                      0x01F680, 0x01F6FF },
{ "Alchemical Symbols",                              0x01F700, 0x01F77F },
{ "Geometric Shapes Extended",                      0x01F780, 0x01F7FF },
{ "Supplemental Arrows-C",                          0x01F800, 0x01F8FF },
{ "Supplemental Symbols and Pictographs",            0x01F900, 0x01F9FF },
{ "Chess Symbols",                                  0x01FA00, 0x01FA6F },
{ "CJK Unified Ideographs Extension B",              0x020000, 0x02A6DF },
{ "CJK Unified Ideographs Extension C",              0x02A700, 0x02B73F },
{ "CJK Unified Ideographs Extension D",              0x02B740, 0x02B81F },
{ "CJK Unified Ideographs Extension E",              0x02B820, 0x02CEAF },
{ "CJK Unified Ideographs Extension F",              0x02CEB0, 0x02EBEF },
{ "CJK Compatibility Ideographs Supplement",        0x02F800, 0x02FA1F },
{ "Tags",                                            0x0E0000, 0x0E007F },
{ "Variation Selectors Supplement",                  0x0E0100, 0x0E01EF },
{ "Supplementary Private Use Area-A",                0x0F0000, 0x0FFFFF },
{ "Supplementary Private Use Area-B",                0x100000, 0x10FFFF },
}
blocks.length = #blocks


function export.enum_blocks()
-- An ipairs-type iterator generator for the list of blocks.
return function (blocks, i)
function p.enum_blocks()
i = i + 1
local blocks = loader.blocks
local data = blocks[i]
return block_iter, blocks, 0
if not data then
return nil
end
return i, unpack(data)
end, blocks, 0
end
end


function export.lookup_plane(codepoint)
function p.lookup_plane(codepoint)
local i = floor(codepoint / 0x10000)
local i = floor(codepoint / 0x10000)
return planes[i] or ("Plane %u"):format(i)
return planes[i] or ("Plane %u"):format(i)
end
end


-- Binary search, to avoid iterating over entire table in order to look up the
function p.lookup_block(codepoint)
-- higher codepoints.
local blocks = loader.blocks
function export.lookup_block(codepoint)
local range = binary_range_search(codepoint, blocks)
local iStart, iEnd = 1, blocks.length or #blocks
if range then
while iStart <= iEnd do
return range[3]
local iMid = floor((iStart + iEnd) / 2)
else
local range = blocks[iMid]
return "No Block"
if codepoint < range[2] then
iEnd = iMid - 1
elseif codepoint <= range[3] then
return range[1]
else
iStart = iMid + 1
end
end
end
error(string.format("No block found for codepoint U+%04X.", codepoint))
end
end


function export.get_block_range(name)
function p.get_block_info(name)
local range
for i, block in ipairs(loader.blocks) do
if block[3] == name then
for i, block in ipairs(blocks) do
return block
if block[1] == name then
range = block
end
end
end
if range then
return range[2], range[3]
end
end
end
end


function export.is_valid_pagename(pagename)
function p.is_valid_pagename(pagename)
local has_nonws = false
local has_nonws = false


Line 497: Line 238:
end
end


local printable, result = export.is_printable(cp)
local printable, result = p.is_printable(cp)
if not printable then
if not printable then
return false
return false
Line 511: Line 252:


local function manual_unpack(what, from)
local function manual_unpack(what, from)
if what[from + 1] == nil then
return what[from]
end
local result = {}
local result = {}
from = from or 1
from = from or 1
Line 521: Line 266:
end
end


local function memo_lookup(loader, match_func, ...)
local function compare_ranges(range1, range2)
return range1[1] < range2[1]
end
 
-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "loader" loads and returns the "singles" and "ranges" tables.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
local dots = { ... }
local dots = { ... }
local cache = {}
local cache = {}
Line 528: Line 289:
return function (codepoint)
return function (codepoint)
if not singles then
if not singles then
singles, ranges = loader()
local data_module = loader[data_module_subpage]
singles, ranges = data_module.singles, data_module.ranges
end
end


Line 535: Line 297:
end
end


local lastlast = -1
local range = binary_range_search(codepoint, cache)
for _, range in pairs(cache) do
if range then
if (range[1] <= codepoint) and (codepoint <= range[2]) then
return match_func(codepoint, manual_unpack(range, 3))
return match_func(codepoint, unpack(range, 3))
end
end
local range, index = binary_range_search(codepoint, ranges)
if range then
table.insert(cache, range)
table.sort(cache, compare_ranges)
return match_func(codepoint, manual_unpack(range, 3))
end
end
 
for _, range in pairs(ranges) do
if ranges[index] then
if codepoint < range[1] then
local dots_range
table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) })
if codepoint > ranges[index][2] then
return match_func(codepoint, unpack(dots))
dots_range = {
elseif codepoint <= range[2] then
ranges[index][2] + 1,
table.insert(cache, { manual_unpack(range) })
ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
return match_func(codepoint, manual_unpack(range, 3))
unpack(dots)
else
}
lastlast = range[2]
else -- codepoint < range[index][1]
dots_range = {
ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
ranges[index][1] - 1,
unpack(dots)
}
end
end
table.sort(cache, compare_ranges)
end
end
 
return match_func(codepoint)
return match_func(codepoint)
end
end
end
end


-- Get a codepoint's combining class value in [[Module:Unicode data/combining]],
-- Get a code point's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- That is, return true if character is combining, or false if it is not.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
-- more information.
export.is_combining = memo_lookup(function ()
p.is_combining = memo_lookup(
local m_comb = mw.loadData('Module:Unicode data/combining')
"combining",
return m_comb.single, m_comb.ranges
function (codepoint, combining_class)
end, function (codepoint, combining_class)
return combining_class and combining_class ~= 0 or false
return combining_class and combining_class ~= 0
end,
or false
0)
end, 0)


function export.add_dotted_circle(str)
function p.add_dotted_circle(str)
return (mw.ustring.gsub(str, ".",
return (mw.ustring.gsub(str, ".",
function(char)
function(char)
if export.is_combining(mw.ustring.codepoint(char)) then
if p.is_combining(mw.ustring.codepoint(char)) then
return '◌' .. char
return '◌' .. char
end
end
Line 581: Line 353:
end
end


local lookup_control = memo_lookup(function ()
local lookup_control = memo_lookup(
local m_cc = mw.loadData('Module:Unicode data/control')
"control",
return m_cc.single, m_cc.ranges
function (codepoint, ccc)
end, function (codepoint, ccc)
return ccc or "assigned"
return ccc or "assigned"
end,
end, "assigned")
"assigned")
p.lookup_control = lookup_control


function export.is_assigned(codepoint)
function p.is_assigned(codepoint)
return lookup_control(codepoint) ~= "unassigned"
return lookup_control(codepoint) ~= "unassigned"
end
end


function export.is_printable(codepoint)
function p.is_printable(codepoint)
local result = lookup_control(codepoint)
local result = lookup_control(codepoint)
return (result == "assigned") or (result == "space-separator"), result
return (result == "assigned") or (result == "space-separator"), result
end
end


function export.is_whitespace(codepoint)
function p.is_whitespace(codepoint)
local result = lookup_control(codepoint)
local result = lookup_control(codepoint)
return (result == "space-separator"), result
return (result == "space-separator"), result
end
end


-- to be used in language-neutral context only (e.g. character lists)
p.lookup_category = memo_lookup(
"category",
function (codepoint, category)
return category
end,
"Cn")


local script_pats
local lookup_script = memo_lookup(
"scripts",
function (codepoint, script_code)
return script_code or 'Zzzz'
end,
"Zzzz")
p.lookup_script = lookup_script


-- Scripts that consist entirely of characters from another script.
function p.get_best_script(str)
local script_blacklist = {
-- Check type of argument, because mw.text.decode coerces numbers to strings!
["Latf"] = true;
require "libraryUtil".checkType("get_best_script", 1, str, "string")
["Hans"] = true;
["Hant"] = true;
-- Convert HTML character references (including named character references,
["Kore"] = true;
-- or character entities) to characters.
["Jpan"] = true;
str = mw.text.decode(str, true)
["fa-Arab"] = true;
["kk-Arab"] = true;
local scripts = {}
["ks-Arab"] = true;
for codepoint in mw.ustring.gcodepoint(str) do
["ku-Arab"] = true;
local script = lookup_script(codepoint)
["mzn-Arab"] = true;
["ota-Arab"] = true;
-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
["pa-Arab"] = true;
if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
["ps-Arab"] = true;
scripts[script] = true
["sd-Arab"] = true;
["tt-Arab"] = true;
["ug-Arab"] = true;
["ur-Arab"] = true;
["nv-Latn"] = true;
["pjt-Latn"] = true;
["Zyyy"] = true;
}
 
--[[
Problem scripts: Grek and polytonic, Cyrl and Cyrs, Latn and Latinx.
In each key-value pair, the value should take precedence over the key.
]]
 
local overridden_by = {
["Cyrs"] = "Cyrl",
["polytonic"] = "Grek",
["Latinx"] = "Latn",
}
 
local script_cache = {}
 
function export.get_script(codepoint)
local text
if type(codepoint) == "number" then
text = mw.ustring.char(codepoint)
elseif type(codepoint) == "string" then
text = codepoint
else
error("Argument to get_script should be a number (codepoint) or string.")
end
 
for pat, sc in pairs(script_cache) do
if mw.ustring.match(text, pat) and not overridden_by[sc] then
return sc
end
end
end
end
-- If scripts does not contain two or more keys,
-- return first and only key (script code) in table.
if not next(scripts, next(scripts)) then
return next(scripts)
end -- else return majority script, or else "Zzzz"?
end


if not script_pats then
function p.is_Latin(str)
local m_scripts = mw.loadData("Module:scripts/data")
require "libraryUtil".checkType("get_best_script", 1, str, "string")
script_pats = {}
str = mw.text.decode(str, true)
for sc, info in pairs(m_scripts) do
if info.characters and not script_blacklist[sc] then
-- Search for the leading bytes that introduce the UTF-8 encoding of the
script_pats[sc] = "[" .. info.characters .. "]"
-- code points U+0340-U+10FFFF. If they are not found and there is at least
-- one Latin-script character, the string counts as Latin, because the rest
-- of the characters can only be Zyyy, Zinh, and Zzzz.
-- The only scripts found below U+0370 (the first code point of the Greek
-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
-- See the codepage in the [[UTF-8]] article.
if not str:find "[\205-\244]" then
for codepoint in mw.ustring.gcodepoint(str) do
if lookup_script(codepoint) == "Latn" then
return true
end
end
end
end
end
end
 
for sc, pat in pairs(script_pats) do
local Latn = false
if mw.ustring.match(text, pat) then
local overriding = overridden_by[sc]
for codepoint in mw.ustring.gcodepoint(str) do
if overriding and script_pats[overriding] and mw.ustring.match(text, script_pats[overriding]) then
local script = lookup_script(codepoint)
script_cache[script_pats[overriding]] = overriding
return overriding
if script == "Latn" then
else
Latn = true
script_cache[pat] = sc
elseif not (script == "Zyyy" or script == "Zinh"
return sc
or script == "Zzzz") then
end
return false
end
end
end
end
 
return "None"
return Latn
end
end


local function sortRange(range1, range2)
-- Checks that a string contains only characters belonging to right-to-left
return range1[1] < range2[1]
-- scripts, or characters of ignorable scripts.
end
function p.is_rtl(str)
 
require "libraryUtil".checkType("get_best_script", 1, str, "string")
--[[
str = mw.text.decode(str, true)
Binary search: more efficient for the longer lists of codepoint ranges than
for the shorter ones.
-- Search for the leading bytes that introduce the UTF-8 encoding of the
]]
-- code points U+0580-U+10FFFF. If they are not found, the string can only
local function binary_search(ranges, value)
-- have characters from a left-to-right script, because the first code point
if not ranges then
-- in a right-to-left script is U+0591, in the Hebrew block.
return nil
if not str:find "[\214-\244]" then
return false
end
end
-- Initialize numbers.
local iStart, iMid = 1, 0
local result = false
-- Can't use # because table is loaded by mw.loadData.
local rtl = loader.scripts.rtl
local iEnd = ranges.length or require("Module:table").size(ranges)
for codepoint in mw.ustring.gcodepoint(str) do
 
local script = lookup_script(codepoint)
if iEnd == 0 then
return nil
if rtl[script] then
result = true
elseif not (script == "Zyyy" or script == "Zinh"
or script == "Zzzz") then
return false
end
end
end
return result
end


local iterations = 0


-- Do search.
--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------
while iStart <= iEnd do
iterations = iterations + 1


-- Calculate middle.
external entry from an {{#invoke:}} to determine if a string of text is rtl.  Strips html and html-like tags so
iMid = floor((iStart + iEnd) / 2)
that those tags don't corrupt the is-rtl-is-not-rtl determination; this added for the cases where the rtl text
has <br /> tags.


-- Get compare value.
]]
local range = ranges[iMid]


if range[1] > value then
function p.is_rtl_frame (frame)
iEnd = iMid - 1
local str = frame.args[1]; -- get the string from the {{#invoke:}} frame
str = str:gsub ('%b<>', ''); -- strip any html and html-like tags
return p.is_rtl (str); -- return if whatever remains rtl; false else
end


-- Return matching index. Assumes there are no duplicates.
elseif value <= range[2] then
return range


-- Keep searching.
local function get_codepoint(args, arg)
else
local codepoint_string = args[arg]
iStart = iMid + 1
or errorf(2, "Parameter %s is required", tostring(arg))
end
local codepoint = tonumber(codepoint_string, 16)
or errorf(2, "Parameter %s is not a code point in hexadecimal base",
tostring(arg))
if not (0 <= codepoint and codepoint <= 0x10FFFF) then
errorf(2, "code point in parameter %s out of range", tostring(arg))
end
end
return nil
return codepoint
end
end


local function look_up_in_order(number, ranges)
local function get_func(args, arg, prefix)
for i, range in ipairs(ranges) do
local suffix = args[arg]
if number < range[1] then
or errorf(2, "Parameter %s is required", tostring(arg))
return nil
suffix = mw.text.trim(suffix)
elseif number <= range[2] then
local func_name = prefix .. suffix
return range[3]
local func = p[func_name]
end
or errorf(2, "There is no function '%s'", func_name)
end
return func
end
end


-- Save previously used codepoint ranges in case another character is in the
-- This function allows any of the "lookup" functions to be invoked. The first
-- same range.
-- parameter is the word after "lookup_"; the second parameter is the code point
local ranges_cache = {}
-- in hexadecimal base.
 
function p.lookup(frame)
--[=[
local func = get_func(frame.args, 1, "lookup_")
Takes a codepoint or a character and finds the script code (if any) that is
local codepoint = get_codepoint(frame.args, 2)
appropriate for it based on the codepoint, using the data module
local result = func(codepoint)
[[Module:Unicode data/scripts]]. The data module was generated from the
if func == p.lookup_name then
patterns in [[Module:scripts/data]] using [[Module:User:Erutuon/script recognition]].
-- Prevent code point labels such as <control-0000> from being
 
-- interpreted as HTML tags.
Converts the character to a codepoint. Returns a script code if the codepoint
result = result:gsub("<", "&lt;")
is in the list of individual characters, or if it is in one of the defined
ranges in the 4096-character block that it belongs to, else returns "None".
]=]
function export.char_to_script(char)
local lookup = mw.loadData("Module:Unicode data/scripts")
local t = type(char)
local codepoint
if t == "string" then
local etc
codepoint, etc = mw.ustring.codepoint(char)
if etc then
error("Argument to char_to_script should be a single character.")
end
elseif t == "number" then
codepoint = char
else
error("Argument to char_to_script should be a string or a number, but its type is " .. t .. ".")
end
 
local individual_match = lookup.individual[codepoint]
if individual_match then
return individual_match
else
local script = look_up_in_order(codepoint, ranges_cache)
if script then
return script
end
 
local index = floor(codepoint / 0x1000)
 
script = look_up_in_order(index, lookup.blocks)
if script then
return script
end
 
local range = binary_search(lookup[index], codepoint)
if range then
table.insert(ranges_cache, range)
table.sort(ranges_cache, sortRange)
return range[3]
end
end
end
 
return result
return "None"
end
end


function export.find_best_script(text)
function p.is(frame)
local scripts = {}
local func = get_func(frame.args, 1, "is_")
for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
local script = export.char_to_script(character)
scripts[script] = (scripts[script] or 0) + 1
end
local best_script
local greatest_count = 0
for script, count in pairs(scripts) do
if count > greatest_count then
best_script = script
greatest_count = count
end
end
return best_script
-- is_Latin and is_valid_pagename take strings.
end
if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then
 
return (func(frame.args[2]))
local unsupported_title = {
else -- The rest take code points.
[0x0020] = "Unsupported titles/Space";
local codepoint = get_codepoint(frame.args, 2)
[0x0023] = "Unsupported titles/Number sign";
return (func(codepoint)) -- Adjust to one result.
[0x002E] = "Unsupported titles/Full stop";
[0x003A] = "Unsupported titles/Colon";
[0x003C] = "Unsupported titles/Less than";
[0x003E] = "Unsupported titles/Greater than";
[0x005B] = "Unsupported titles/Left square bracket";
[0x005D] = "Unsupported titles/Right square bracket";
[0x005F] = "Unsupported titles/Low line";
[0x007B] = "Unsupported titles/Left curly bracket";
[0x007C] = "Unsupported titles/Vertical line";
[0x007D] = "Unsupported titles/Right curly bracket";
[0x1680] = "Unsupported titles/Ogham space";
[0xFFFD] = "Unsupported titles/Replacement character";
}
 
function export.get_entry_title(codepoint)
if unsupported_title[codepoint] then
return unsupported_title[codepoint]
end
if lookup_control(codepoint) ~= "assigned" then
return nil
end
end
return mw.ustring.char(codepoint)
end
end


return export
return p

Latest revision as of 20:43, 28 November 2023

Usage

This module provides functions that access information on Unicode code points. The information is retrieved from data modules generated from the Unicode Character Database, or derived by rules given in the Unicode Specification. It and its submodules were copied from English Wiktionary and then modified; see there for more information.

Functions

<syntaxhighlight lang="lua" class="" id="" style="" inline="1">lookup_name(codepoint)</syntaxhighlight>
Receives a codepoint (number) and returns its name or label; for example, <syntaxhighlight lang="lua" class="" id="" style="" inline="1">lookup_name(0xA9)</syntaxhighlight> returns <syntaxhighlight lang="lua" class="" id="" style="" inline="1">"COPYRIGHT SIGN"</syntaxhighlight>.
<syntaxhighlight lang="lua" class="" id="" style="" inline="1">lookup, is</syntaxhighlight>
Template-invokable functions that allow access to the functions starting with lookup and is. Replace the first underscore in the function name with a pipe, and add the codepoint in hexadecimal base, or a bit of text, for is_Latin, is_rtl, and is_valid_pagename, as the next parameter. For example, {{#invoke:Unicode data|lookup|name|61}} → <reserved-0061>; {{#invoke:Unicode data|is|Latin|àzàhàr̃iyyā̀}} → true.

Data modules

The data used by functions in this module is found in submodules. Some are generated by AWK scripts shown at User:Kephir/Unicode on English Wiktionary, others by Lua scripts on the /make subpages of the submodules.

The name data modules (Module:Unicode data/names/xxx) were compiled from UnicodeData.txt. Each one contains, at maximum, code points U+xxx000 to U+xxxFFF.

Copyright

The Unicode database is released by Unicode Inc. under the following terms:

Copyright © 1991-2018 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in https://www.unicode.org/copyright.html.

Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data files and any associated documentation (the "Data Files") or Unicode software and any associated documentation (the "Software") to deal in the Data Files or Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom the Data Files or Software are furnished to do so, provided that either (a) this copyright and permission notice appear with all copies of the Data Files or Software, or (b) this copyright and permission notice appear in associated Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder.


local p = {}

local floor = math.floor

-- Raises an error whose message is produced by string.format.
-- Two calling conventions are accepted:
--   errorf(level, fmt, ...) -- "level" is a number; the error is raised with
--                              stack level "level + 1" to skip errorf itself.
--   errorf(fmt, ...)        -- no level given; the error is raised with
--                              stack level 2 (the caller of errorf).
local function errorf(level, ...)
	local message, stack_level
	if type(level) == "number" then
		message, stack_level = string.format(...), level + 1
	else
		-- The first argument is really the format string.
		message, stack_level = string.format(level, ...), 2
	end
	return error(message, stack_level)
end

-- Binary search over "ranges", an array of arrays whose first and second
-- items are the low and high code points of a range.
-- The number of ranges is taken from ranges.length when present; otherwise it
-- is computed with [[Module:TableTools]].
-- Returns the matching range and its index, or nil and the index of the last
-- range that was examined (nil if no range was examined).
local function binary_range_search(codepoint, ranges)
	local lower = 1
	local upper = ranges.length or require "Module:TableTools".length(ranges)
	local pivot
	while lower <= upper do
		pivot = floor((lower + upper) / 2)
		local entry = ranges[pivot]
		if codepoint < entry[1] then
			-- Code point lies before this range: continue in the lower half.
			upper = pivot - 1
		else
			if codepoint <= entry[2] then
				return entry, pivot
			end
			-- Code point lies after this range: continue in the upper half.
			lower = pivot + 1
		end
	end
	return nil, pivot
end
p.binary_range_search = binary_range_search

--[[
local function linear_range_search(codepoint, ranges)
	for i, range in ipairs(ranges) do
		if range[1] <= codepoint and codepoint <= range[2] then
			return range
		end
	end
end
--]]

-- Lazy loader for the data submodules. Indexing "loader" with a subpage name
-- (the module title without the "Module:Unicode data/" prefix) loads that
-- module with mw.loadData; for example, loader.blocks gives
-- [[Module:Unicode data/blocks]]. The outcome is stored in "loader" under the
-- same key, so later accesses do not go through __index again. When loading
-- fails, false is stored and returned instead of raising an error.
local loader = setmetatable({}, {
	__index = function (cache, subpage)
		local loaded, module_data = pcall(mw.loadData, "Module:Unicode data/" .. subpage)
		if loaded then
			cache[subpage] = module_data
			return module_data
		end
		cache[subpage] = false
		return false
	end
})

-- Ranges of code points whose names or labels are generated by rule rather
-- than listed individually. Each entry is { low, high, data }, where "data" is
-- either a format string that receives the code point, or a function that
-- takes the code point and returns the name.
--
-- The entries MUST be kept in ascending, non-overlapping order, because
-- p.lookup_name searches this table with binary_range_search. An entry that
-- is out of order can never be found.
--
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
	{  0x18800,  0x18AFF, function (codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end},
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
-- Stored explicitly so that binary_range_search does not have to compute it.
name_hooks.length = #name_hooks

local name_range_cache

-- Produces a name for "codepoint" from a name hook's data: a string is used
-- as a format string that receives the code point; anything else is called
-- with the code point as its only argument.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end

--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Codepoint %04X out of range", codepoint)
	end
end
--]]

-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
-- Returns the name or code point label for "codepoint" (a number).
-- The sources are tried in this order:
--   1. the noncharacter rule, giving "<noncharacter-XXXX>";
--   2. the rule-based ranges in "name_hooks" (the most recently matched hook
--      is remembered in "name_range_cache" and tried first);
--   3. the data module "names/XXX", where XXX is the code point divided by
--      0x1000, in hexadecimal;
--   4. otherwise the label "<reserved-XXXX>".
function p.lookup_name(codepoint)
	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or floor(codepoint % 0x10000) >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	-- Check if the previously used "name hook" applies to this code point.
	if name_range_cache
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	local range = binary_range_search(codepoint, name_hooks)
	if range then
		name_range_cache = range
		return generate_name(range[3], codepoint)
	end

	-- Floor the quotient explicitly: "%X" expects an integer, and passing a
	-- fractional number relies on implicit truncation (and raises an error on
	-- Lua versions that have a separate integer type).
	local data = loader[('names/%03X'):format(floor(codepoint / 0x1000))]
	local name = data and data[codepoint]
	if name then
		return name
	end

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	return ("<reserved-%04X>"):format(codepoint)
end

-- Returns the entry for "codepoint" in the data module "images/XXX", where XXX
-- is the code point divided by 0x1000, in hexadecimal. Returns nothing if
-- that data module could not be loaded, and nil if it has no entry for the
-- code point.
function p.lookup_image(codepoint)
	-- Floor the quotient explicitly: "%X" expects an integer, and passing a
	-- fractional number relies on implicit truncation (and raises an error on
	-- Lua versions that have a separate integer type).
	local data = loader[('images/%03X'):format(floor(codepoint / 0x1000))]

	if data then
		return data[codepoint]
	end
end

-- Names of the Unicode planes that have one, keyed by plane number
-- (the code point divided by 0x10000, rounded down).
local planes = {
	[0x00] = "Basic Multilingual Plane",
	[0x01] = "Supplementary Multilingual Plane",
	[0x02] = "Supplementary Ideographic Plane",
	[0x03] = "Tertiary Ideographic Plane",
	[0x0E] = "Supplementary Special-purpose Plane",
	[0x0F] = "Supplementary Private Use Area-A",
	[0x10] = "Supplementary Private Use Area-B",
}

-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
local blocks

-- Stateless iterator step over an array of three-item entries: given the
-- array and the previous index, returns the next index followed by the three
-- items of the entry at that index, or nothing once the array is exhausted.
local function block_iter(blocks, i)
	local next_index = i + 1
	local entry = blocks[next_index]
	if not entry then
		return
	end
	-- Unpack doesn't work on tables loaded with mw.loadData, so the items
	-- are returned one by one.
	return next_index, entry[1], entry[2], entry[3]
end

-- Iterator generator for the list of blocks, for use in a generic "for" in
-- the manner of ipairs. Each step yields the index followed by the three
-- items of the block entry at that index in [[Module:Unicode data/blocks]].
function p.enum_blocks()
	return block_iter, loader.blocks, 0
end

-- Returns the name of the plane that contains "codepoint". Planes without an
-- entry in "planes" are labelled "Plane N", where N is the plane number.
function p.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end

-- Returns the third item (the block name) of the entry in
-- [[Module:Unicode data/blocks]] whose range contains "codepoint", or
-- "No Block" if no entry contains it.
function p.lookup_block(codepoint)
	local match = binary_range_search(codepoint, loader.blocks)
	if not match then
		return "No Block"
	end
	return match[3]
end

-- Returns the first entry in [[Module:Unicode data/blocks]] whose third item
-- (the block name) equals "name". Returns nothing if there is no such entry.
function p.get_block_info(name)
	for _, block_entry in ipairs(loader.blocks) do
		local block_name = block_entry[3]
		if block_name == name then
			return block_entry
		end
	end
end

-- Returns true if "pagename" passes all of these checks, false otherwise:
--   * it contains none of the explicitly rejected code points listed below;
--   * every code point is printable according to p.is_printable;
--   * at least one code point is not classified as "space-separator".
-- An empty string therefore yields false.
function p.is_valid_pagename(pagename)
	local found_visible = false

	for cp in mw.ustring.gcodepoint(pagename) do
		local rejected = cp == 0x0023 -- #
			or cp == 0x005B -- [
			or cp == 0x005D -- ]
			or cp == 0x007B -- {
			or cp == 0x007C -- |
			or cp == 0x007D -- }
			or cp == 0x180E -- MONGOLIAN VOWEL SEPARATOR
			or (0x2000 <= cp and cp <= 0x200A) -- spaces in General Punctuation block
			or cp == 0xFFFD -- REPLACEMENT CHARACTER
		if rejected then
			return false
		end

		local printable, control_class = p.is_printable(cp)
		if not printable then
			return false
		end

		-- Remember whether anything other than whitespace has been seen.
		found_visible = found_visible or control_class ~= "space-separator"
	end

	return found_visible
end

-- Returns the items of the array "what" from index "from" (default 1) onward
-- as multiple values. This is a substitute for unpack(what, from), which
-- doesn't work on tables loaded with mw.loadData.
-- When there is nothing after index "from", the single item what[from] is
-- returned directly, without building an intermediate table.
local function manual_unpack(what, from)
	-- Apply the default before "from" is used in arithmetic; otherwise
	-- omitting it raises an error instead of starting at the first item.
	from = from or 1

	if what[from + 1] == nil then
		return what[from]
	end

	local result = {}
	for i, item in ipairs(what) do
		if i >= from then
			result[#result + 1] = item
		end
	end
	return unpack(result)
end

-- Ordering function for table.sort: a range sorts before another when its
-- first item (its low code point) is smaller.
local function compare_ranges(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end

-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "data_module_subpage" is the name of the data module minus the
-- "Module:Unicode data/" part; the module is loaded through "loader" the first
-- time the returned function is called.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			if not data_module then
				-- "loader" yields false when the module could not be loaded.
				error(("Could not load [[Module:Unicode data/%s]]")
					:format(tostring(data_module_subpage)))
			end
			singles, ranges = data_module.singles, data_module.ranges
		end

		local single = singles[codepoint]
		if single then
			return match_func(codepoint, single)
		end

		-- Ranges (matched or default) found by earlier calls.
		local cached = binary_range_search(codepoint, cache)
		if cached then
			return match_func(codepoint, manual_unpack(cached, 3))
		end

		local range, index = binary_range_search(codepoint, ranges)
		if range then
			table.insert(cache, range)
			table.sort(cache, compare_ranges)
			return match_func(codepoint, manual_unpack(range, 3))
		end

		-- No match. "index" is the last range examined by the search, which
		-- is adjacent to the gap containing the code point. Save that gap in
		-- the cache with the default data so that later lookups in it are
		-- answered from the cache.
		if ranges[index] then
			local gap_low, gap_high
			if codepoint > ranges[index][2] then
				gap_low = ranges[index][2] + 1
				gap_high = ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF
			else -- codepoint < ranges[index][1]
				gap_low = ranges[index - 1] and ranges[index - 1][2] + 1 or 0
				gap_high = ranges[index][1] - 1
			end
			table.insert(cache, { gap_low, gap_high, unpack(dots) })
			table.sort(cache, compare_ranges)
		end

		-- Pass the default data, so that the first lookup in a gap gives the
		-- same result as later ones served from the cache.
		return match_func(codepoint, unpack(dots))
	end
end

-- Look up a code point's combining class value in
-- [[Module:Unicode data/combining]] and report whether it is non-zero: true
-- if the character is combining, false if it is not. Zero is given as the
-- default for code points not found in the data module.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
	"combining",
	function (codepoint, combining_class)
		if combining_class then
			return combining_class ~= 0
		end
		return false
	end,
	0)

-- Return str with a dotted circle inserted before every character for which
-- p.is_combining returns true. Other characters are left as they are.
function p.add_dotted_circle(str)
	local function with_circle(char)
		local codepoint = mw.ustring.codepoint(char)
		if p.is_combining(codepoint) then
			return '◌' .. char
		end
		-- Returning nothing tells gsub to keep the character unchanged.
	end

	-- gsub also returns the number of matches; only the string is wanted.
	local result = mw.ustring.gsub(str, ".", with_circle)
	return result
end

-- Look up the value stored for a code point in [[Module:Unicode data/control]].
-- The result is "assigned" when no value is available; the functions below
-- also test for the values "unassigned" and "space-separator".
local lookup_control = memo_lookup(
	"control",
	function (codepoint, status)
		if status then
			return status
		end
		return "assigned"
	end,
	"assigned")
p.lookup_control = lookup_control

-- Return true unless lookup_control reports the code point as "unassigned".
function p.is_assigned(codepoint)
	local status = lookup_control(codepoint)
	return status ~= "unassigned"
end

-- Return two values: whether lookup_control reports the code point as
-- "assigned" or "space-separator", and the looked-up value itself.
function p.is_printable(codepoint)
	local status = lookup_control(codepoint)
	local printable = status == "assigned" or status == "space-separator"
	return printable, status
end

-- Return two values: whether lookup_control reports the code point as
-- "space-separator", and the looked-up value itself.
function p.is_whitespace(codepoint)
	local status = lookup_control(codepoint)
	local is_space = status == "space-separator"
	return is_space, status
end

-- Look up the value stored for a code point in
-- [[Module:Unicode data/category]] (presumably its Unicode General Category;
-- verify against the data module). "Cn" is registered as the default data.
-- The callback simply passes on whatever data it receives.
p.lookup_category = memo_lookup(
	"category",
	function (_, category)
		return category
	end,
	"Cn")

-- Look up the script code stored for a code point in
-- [[Module:Unicode data/scripts]]. The result is 'Zzzz' when no script code
-- is available.
local lookup_script = memo_lookup(
	"scripts",
	function (codepoint, script_code)
		if script_code then
			return script_code
		end
		return 'Zzzz'
	end,
	"Zzzz")
p.lookup_script = lookup_script

-- Determine the script of a string. Characters whose script is "Zyyy", "Zinh"
-- or "Zzzz" are ignored. If exactly one other script occurs, its code is
-- returned (followed by the value true, because the result of next() is
-- returned as is). If no other script occurs, nil is returned. If two or more
-- occur, nothing is returned.
function p.get_best_script(str)
	-- mw.text.decode coerces numbers to strings, so reject non-strings first.
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Turn HTML character references (numeric and named) into characters.
	str = mw.text.decode(str, true)

	-- Set of script codes seen in the string, ignorable ones excluded.
	local seen = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		if script ~= "Zyyy" and script ~= "Zinh" and script ~= "Zzzz" then
			seen[script] = true
		end
	end

	-- If there is no key after the first one, the set holds at most one
	-- script: return it (or nil for an empty set).
	local first = next(seen)
	if not next(seen, first) then
		return next(seen)
	end -- else return majority script, or else "Zzzz"?
end

-- Return true if str contains at least one character of the Latin script
-- ("Latn") and no character of any script other than Latn and the ignorable
-- scripts "Zyyy", "Zinh" and "Zzzz"; otherwise return false.
-- HTML character references in str are decoded first.
-- Raises an error if str is not a string.
function p.is_Latin(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	str = mw.text.decode(str, true)
	
	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
		-- No character is Latn, so the general check below could only
		-- return false as well; skip the second pass over the string.
		return false
	end
	
	-- General case: every character has to be Latn or ignorable, and at least
	-- one has to be Latn.
	local Latn = false
	
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		
		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end
	
	return Latn
end

-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts ("Zyyy", "Zinh", "Zzzz").
-- Returns true if at least one character belongs to a right-to-left script
-- and no character belongs to any other non-ignorable script; false otherwise.
-- HTML character references in str are decoded first.
-- Raises an error if str is not a string.
function p.is_rtl(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)
	
	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end
	
	local result = false
	-- Table indexed by script code; a truthy entry marks the script as
	-- right-to-left.
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		
		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			-- A character from a non-ignorable script that is not
			-- right-to-left settles the answer.
			return false
		end
	end
	
	return result
end


--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------

Entry point for {{#invoke:}}: reports whether the text in the first parameter is right-to-left.  Anything that
looks like an html tag (a balanced <...> span) is removed before the text is handed to is_rtl(), so that markup
such as <br /> inside rtl text does not spoil the determination.

]]

function p.is_rtl_frame (frame)
	-- gsub also returns a match count; only the stripped text is kept
	local stripped = frame.args[1]:gsub ('%b<>', '');
	return p.is_rtl (stripped);
end


-- Read parameter "arg" from "args" and parse it as a code point written in
-- hexadecimal. Raises an error (attributed to the caller) if the parameter is
-- missing, is not a hexadecimal number, or lies outside 0-0x10FFFF.
local function get_codepoint(args, arg)
	local raw = args[arg]
	if not raw then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	local value = tonumber(raw, 16)
	if not value then
		errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	end

	if value < 0 or value > 0x10FFFF then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return value
end

-- Read parameter "arg" from "args", trim it, prepend "prefix", and return the
-- exported function of that name. Raises an error (attributed to the caller)
-- if the parameter is missing or the module exports no such function.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
	if not suffix then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	local func_name = prefix .. mw.text.trim(suffix)
	local func = p[func_name]
	if not func then
		errorf(2, "There is no function '%s'", func_name)
	end
	return func
end

-- Invocable front end for the "lookup_" functions. Parameter 1 is the part of
-- the function name that follows "lookup_"; parameter 2 is the code point,
-- written in hexadecimal.
function p.lookup(frame)
	local args = frame.args
	local func = get_func(args, 1, "lookup_")
	local codepoint = get_codepoint(args, 2)
	local result = func(codepoint)

	if func ~= p.lookup_name then
		return result
	end

	-- Code point labels such as <control-0000> would otherwise be
	-- interpreted as HTML tags. gsub's match count is dropped.
	local escaped = result:gsub("<", "&lt;")
	return escaped
end

-- Invocable front end for the "is_" functions. Parameter 1 is the part of the
-- function name that follows "is_". Parameter 2 is passed on as a string for
-- is_Latin, is_valid_pagename and is_rtl, and parsed as a hexadecimal code
-- point for every other function. Only the first result is returned.
function p.is(frame)
	local args = frame.args
	local func = get_func(args, 1, "is_")

	local takes_string = func == p.is_Latin
		or func == p.is_valid_pagename
		or func == p.is_rtl

	local argument
	if takes_string then
		argument = args[2]
	else
		argument = get_codepoint(args, 2)
	end

	return (func(argument)) -- Adjust to one result.
end

return p