Module:Unicode data: Difference between revisions
meta>Erutuon m (merged surrogates again) |
meta>Erutuon (moved Hangul data to Module:Unicode data/Hangul) |
||
Line 2: | Line 2: | ||
local floor = math.floor | local floor = math.floor | ||
-- For the algorithm used to generate Hangul Syllable names, | -- For the algorithm used to generate Hangul Syllable names, | ||
Line 10: | Line 7: | ||
-- Unicode Specification: | -- Unicode Specification: | ||
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | -- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | ||
local | local Hangul_data -- loaded if needed | ||
-- | |||
local name_hooks = { | local name_hooks = { | ||
Line 40: | Line 15: | ||
{ 0x4E00, 0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph | { 0x4E00, 0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph | ||
{ 0xAC00, 0xD7A3, function (codepoint) -- Hangul Syllables | { 0xAC00, 0xD7A3, function (codepoint) -- Hangul Syllables | ||
Hangul_data = Hangul_data or mw.loadData("Module:Unicode data/Hangul") | |||
local syllable_index = codepoint - 0xAC00 | local syllable_index = codepoint - 0xAC00 | ||
return ("HANGUL SYLLABLE %s%s%s"):format( | return ("HANGUL SYLLABLE %s%s%s"):format( | ||
Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)], | |||
Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count) | |||
/ Hangul_data.trail_count)], | |||
Hangul_data.trails[syllable_index % Hangul_data.trail_count] | |||
) | ) | ||
end }, | end }, | ||
Line 63: | Line 40: | ||
{ 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | { 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | ||
{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | { 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | ||
{ 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane) | ||
{ 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | |||
{ 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement | { 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement | ||
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) |
Revision as of 03:35, 29 June 2018
This Lua module is used on approximately 1,100,000 pages, or roughly 16687% of all pages. To avoid major disruption and server load, any changes should be tested in the module's /sandbox or /testcases subpages, or in your own module sandbox. The tested changes can be added to this page in a single edit. Consider discussing changes on the talk page before implementing them. |
Usage
This module provides functions that access information on Unicode code points. The information is retrieved from data modules generated from the Unicode Character Database, or derived by rules given in the Unicode Specification. It and its submodules were copied from English Wiktionary and then modified; see there for more information.
Functions
- `lookup_name(codepoint)`
- Receives a codepoint (number) and returns its name or label; for example, `lookup_name(0xA9)` returns `"COPYRIGHT SIGN"`.
- `lookup`, `is`
- Template-invokable functions that allow access to the functions starting with `lookup` and `is`. Replace the first underscore in the function name with a pipe, and add the codepoint in hexadecimal base, or a bit of text, for `is_Latin`, `is_rtl`, and `is_valid_pagename`, as the next parameter. For example, `{{#invoke:Unicode data|lookup|name|61}}` → Script error: The function "lookup" does not exist.; `{{#invoke:Unicode data|is|Latin|àzàhàr̃iyyā̀}}` → Script error: The function "is" does not exist.
Data modules
The data used by functions in this module is found in submodules. Some are generated by AWK scripts shown at User:Kephir/Unicode on English Wiktionary, others by Lua scripts on the /make subpages of the submodules.
- Module:Unicode data/aliases: the formal name aliases for characters (from NameAliases.txt)
- Module:Unicode data/blocks: the list of Unicode blocks (from Blocks.txt)
- Module:Unicode data/category: data mapping characters to their General Category (from DerivedGeneralCategory.txt)
- Module:Unicode data/control: data for identifying characters that belong to the General Categories of Separator and Other (from DerivedGeneralCategory.txt)
- Module:Unicode data/combining: data mapping characters to their Combining Classes (from DerivedCombiningClass.txt)
- Module:Unicode data/Hangul: data used to generate the names of Hangul syllables (from Jamo.txt)
- Module:Unicode data/scripts: data mapping characters to their Unicode script properties (from Scripts.txt).
The name data modules (Module:Unicode data/names/xxx) were compiled from UnicodeData.txt. Each one contains, at maximum, code points U+xxx000 to U+xxxFFF. Lua error in mw.title.lua at line 209: too many expensive function calls.
Copyright
The Unicode database is released by Unicode Inc. under the following terms:
Copyright © 1991-2018 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data files and any associated documentation (the "Data Files") or Unicode software and any associated documentation (the "Software") to deal in the Data Files or Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom the Data Files or Software are furnished to do so, provided that either (a) this copyright and permission notice appear with all copies of the Data Files or Software, or (b) this copyright and permission notice appear in associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder.
local p = {}

local floor = math.floor

-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local Hangul_data -- loaded if needed

-- Ranges of codepoints whose names are derived by rule rather than looked up
-- in the name data modules. Each entry is { first, last, generator }, where
-- the generator is either a format string applied to the codepoint or a
-- function receiving the codepoint. Entries must stay sorted by ascending
-- first codepoint: p.lookup_name stops scanning at the first entry whose
-- start is above the codepoint being looked up.
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		Hangul_data = Hangul_data or mw.loadData("Module:Unicode data/Hangul")
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
	{  0x18800,  0x18AF2, function (codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740; U+2B735-U+2B73F lie between
	-- Extensions C and D and must fall through to the reserved case.
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end},
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}

-- The most recently matched entry of name_hooks (nil until a hook matches).
local name_range_cache

-- Apply a name hook generator to a codepoint: a string is used as a format
-- string, anything else is called as a function.
local function generate_name(data, codepoint)
	if type(data) == "string" then
		return data:format(codepoint)
	else
		return data(codepoint)
	end
end

-- Return the items of a sequence from index `from` (default 1) onwards as
-- multiple values. Walks the table with ipairs instead of handing it directly
-- to unpack, so it also works on tables that only support ipairs (as
-- documented for the tables returned by mw.loadData).
local function manual_unpack(what, from)
	local result = {}
	from = from or 1
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end

-- Return the name or label of a codepoint (a number in 0..0x10FFFF).
-- Raises an error if the argument is not a number or is out of range.
-- Order of resolution: noncharacters, rule-derived names (name_hooks),
-- the name data module for the codepoint, and finally "<reserved-XXXX>".
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
	require 'libraryUtil'.checkType('lookup_name', 1, codepoint, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		error(("Codepoint %04X out of range"):format(codepoint))
	end

	-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or floor(codepoint % 0x10000) >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	if name_range_cache -- Check if previously used "name hook" applies to this codepoint.
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	for _, item in ipairs(name_hooks) do
		if codepoint < item[1] then
			break
		elseif codepoint <= item[2] then
			-- Save "name hook" in case another character
			-- from the same range will be looked up in the same module invocation.
			name_range_cache = item
			return generate_name(item[3], codepoint)
		end
	end

	-- A failed load (for instance a data module that does not exist) is
	-- treated the same as a missing entry.
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/names/%03X'):format(codepoint / 0x1000))

	if success and data[codepoint] then
		return data[codepoint]

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end

-- Return the entry for the codepoint in the matching
-- "Module:Unicode data/images/XXX" data module, or nil if that module
-- cannot be loaded or has no entry for the codepoint.
function p.lookup_image(codepoint)
	local success, data = pcall(
		mw.loadData,
		('Module:Unicode data/images/%03X'):format(codepoint / 0x1000)
	)

	if success then
		return data[codepoint]
	end
end

-- Template entry point: reads a hexadecimal codepoint from the first
-- argument of the frame (or of its parent frame) and returns the
-- codepoint's name with "<" replaced by "&lt;".
function p.template_lookup_name(frame)
	local param = frame.args[1] or frame:getParent().args[1]
	local codepoint = tonumber(param, 16)
	if not codepoint then
		-- tostring: param may be nil, which "%s" cannot format.
		error(("Expected a codepoint in hexadecimal base, got '%s'"):format(tostring(param)))
	end
	-- Labels such as "<control-0000>" start with "<"; escape it so the
	-- result is not taken for markup.
	local name = p.lookup_name(codepoint):gsub("<", "&lt;")
	return name
end

-- Names of the planes that have one, keyed by plane number
-- (codepoint divided by 0x10000, rounded down).
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}

-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
local blocks

-- Iterator over the block list, for use in a generic for loop. Each step
-- yields the index of the block followed by the items of its entry (the
-- other functions in this module read them as first codepoint, last
-- codepoint, name).
function p.enum_blocks()
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")

	return function (blocks, i)
		i = i + 1
		local data = blocks[i]
		if not data then
			return nil
		end
		-- manual_unpack rather than unpack: the entry comes from mw.loadData.
		return i, manual_unpack(data)
	end, blocks, 0
end

-- Return the name of the plane containing the codepoint, or "Plane N" for
-- planes without an entry in the planes table.
function p.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %u"):format(i)
end

-- Return the name of the block containing the codepoint; raises an error if
-- no block contains it.
-- Binary search, to avoid iterating over entire table in order to look up the
-- higher codepoints.
function p.lookup_block(codepoint)
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")
	local iStart, iEnd = 1, blocks.length or #blocks

	while iStart <= iEnd do
		local iMid = floor((iStart + iEnd) / 2)
		local range = blocks[iMid]
		if codepoint < range[1] then
			iEnd = iMid - 1
		elseif codepoint <= range[2] then
			return range[3]
		else
			iStart = iMid + 1
		end
	end

	error(string.format("No block found for codepoint U+%04X.", codepoint))
end

-- Return the first and last codepoints of the block with the given name,
-- or nothing if there is no such block. If several entries carry the same
-- name, the last one in the list is used.
function p.get_block_range(name)
	local range

	blocks = blocks or mw.loadData("Module:Unicode data/blocks")

	for i, block in ipairs(blocks) do
		if block[3] == name then
			range = block
		end
	end

	if range then
		return range[1], range[2]
	end
end

-- Return true if every codepoint of pagename is printable and not one of the
-- characters listed below, and at least one codepoint is something other
-- than a space separator; return false otherwise.
function p.is_valid_pagename(pagename)
	local has_nonws = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if (cp == 0x0023) -- #
		or (cp == 0x005B) -- [
		or (cp == 0x005D) -- ]
		or (cp == 0x007B) -- {
		or (cp == 0x007C) -- |
		or (cp == 0x007D) -- }
		or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
		or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
		or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
		then
			return false
		end

		local printable, result = p.is_printable(cp)
		if not printable then
			return false
		end

		if result ~= "space-separator" then
			has_nonws = true
		end
	end

	return has_nonws
end

-- Build a lookup function over lazily loaded data.
--   loader:     called on first use; returns `singles` (indexed by codepoint)
--               and `ranges` (entries of the form { first, last, ... }).
--   match_func: called as match_func(codepoint, ...) with the values found
--               for the codepoint; its results are the lookup's results.
--   ...:        default values passed to match_func for codepoints that fall
--               in a gap before a range.
-- Ranges (and gaps) that have been hit are remembered in `cache`, which is
-- consulted before `ranges` is scanned.
local function memo_lookup(loader, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			singles, ranges = loader()
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		-- End of the previous range visited while scanning `ranges`.
		local lastlast = -1

		for _, range in pairs(cache) do
			if (range[1] <= codepoint) and (codepoint <= range[2]) then
				return match_func(codepoint, unpack(range, 3))
			end
		end

		-- NOTE(review): the early exit below assumes `ranges` is visited in
		-- ascending order, which pairs does not formally guarantee -- confirm
		-- that `ranges` is a plain sequence before switching to ipairs.
		for _, range in pairs(ranges) do
			if codepoint < range[1] then
				-- The codepoint lies in the gap before this range:
				-- remember the gap with the default values.
				table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) })
				return match_func(codepoint, unpack(dots))
			elseif codepoint <= range[2] then
				table.insert(cache, { manual_unpack(range) })
				return match_func(codepoint, manual_unpack(range, 3))
			else
				lastlast = range[2]
			end
		end

		-- Past the last range: match_func receives no values at all.
		return match_func(codepoint)
	end
end

-- Get a codepoint's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(function ()
	local m_comb = mw.loadData('Module:Unicode data/combining')
	return m_comb.single, m_comb.ranges
end, function (codepoint, combining_class)
	return combining_class and combining_class ~= 0 or false
end, 0)

-- Return str with a dotted circle (U+25CC) inserted before every character
-- for which p.is_combining is true.
function p.add_dotted_circle(str)
	return (mw.ustring.gsub(str, ".",
		function(char)
			if p.is_combining(mw.ustring.codepoint(char)) then
				return '◌' .. char
			end
		end))
end

-- Look up a codepoint's value in [[Module:Unicode data/control]];
-- "assigned" is the default when the data module has no value for it.
local lookup_control = memo_lookup(function ()
	local m_cc = mw.loadData('Module:Unicode data/control')
	return m_cc.single, m_cc.ranges
end, function (codepoint, ccc)
	return ccc or "assigned"
end, "assigned")

-- Return true unless the control data marks the codepoint as "unassigned".
function p.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end

-- Return whether the control data value for the codepoint is "assigned" or
-- "space-separator", followed by that value.
function p.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end

-- Return whether the control data value for the codepoint is
-- "space-separator", followed by that value.
function p.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end

-- Titles returned by p.get_entry_title for these codepoints in place of the
-- character itself.
local unsupported_title = {
	[0x0020] = "Unsupported titles/Space";
	[0x0023] = "Unsupported titles/Number sign";
	[0x002E] = "Unsupported titles/Full stop";
	[0x003A] = "Unsupported titles/Colon";
	[0x003C] = "Unsupported titles/Less than";
	[0x003E] = "Unsupported titles/Greater than";
	[0x005B] = "Unsupported titles/Left square bracket";
	[0x005D] = "Unsupported titles/Right square bracket";
	[0x005F] = "Unsupported titles/Low line";
	[0x007B] = "Unsupported titles/Left curly bracket";
	[0x007C] = "Unsupported titles/Vertical line";
	[0x007D] = "Unsupported titles/Right curly bracket";
	[0x1680] = "Unsupported titles/Ogham space";
	[0xFFFD] = "Unsupported titles/Replacement character";
}

-- Return the title to use for the codepoint: its entry in unsupported_title
-- if it has one, nil if its control data value is anything other than
-- "assigned", and otherwise the character itself.
function p.get_entry_title(codepoint)
	if unsupported_title[codepoint] then
		return unsupported_title[codepoint]
	end
	if lookup_control(codepoint) ~= "assigned" then
		return nil
	end
	return mw.ustring.char(codepoint)
end

return p