Jump to content

Module:Unicode data/sandbox

From Wikipedia, the free encyclopedia
-- Unicode data lookup module (Scribunto). Exposes name, block, plane, script,
-- category and related look-ups for Unicode code points, backed by data
-- subpages under "Module:Unicode data/".
local p = {}

local floor = math.floor

-- Raises a formatted error.
-- If "level" is a number it is the stack level (relative to the caller of
-- errorf) and the remaining arguments are the format string and its values;
-- otherwise "level" itself is the format string and the error is reported at
-- the caller's position.
local function errorf(level, ...)
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end

-- Binary search over "ranges", an array of { low, high, ... } arrays that must
-- be sorted in ascending order and must not overlap.
-- Returns the matching range and its index, or nil and the index of the last
-- range that was probed (nil if "ranges" is empty).
-- The array length is taken from ranges.length when present, otherwise it is
-- computed with [[Module:TableTools]].
local function binary_range_search(codepoint, ranges)
	local low, mid, high
	low, high = 1, ranges.length or require("Module:TableTools").length(ranges)
	while low <= high do
		mid = floor((low + high) / 2)
		local range = ranges[mid]
		if codepoint < range[1] then
			high = mid - 1
		elseif codepoint <= range[2] then
			return range, mid
		else
			low = mid + 1
		end
	end
	return nil, mid
end
p.binary_range_search = binary_range_search

--[[
local function linear_range_search(codepoint, ranges)
	for i, range in ipairs(ranges) do
		if range[1] <= codepoint and codepoint <= range[2] then
			return range
		end
	end
end
--]]

-- Load a module by indexing "loader" with the name of the module minus the
-- "Module:Unicode data/" part. For instance, loader.blocks returns
-- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be
-- returned. Results (including false) are stored in "loader" itself, so each
-- subpage is requested at most once.
local loader = setmetatable({}, {
	__index = function(self, key)
		local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key)
		if not success then
			data = false
		end
		self[key] = data
		return data
	end
})

-- Ranges of code points whose names are generated rather than stored.
-- Each entry is { low, high, format string or function(codepoint) }.
-- This list is searched with binary_range_search, so entries MUST be kept in
-- ascending order of their first code point.
--
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local name_hooks = {
	{ 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters
	{ 0x7F, 0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{ 0x3400, 0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{ 0x4E00, 0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{ 0xAC00, 0xD7A3, function(codepoint) -- Hangul Syllables
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{ 0xD800, 0xDFFF, "<surrogate-%04X>" },
	{ 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{ 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{ 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{ 0x17000, 0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
	{ 0x18800, 0x18AFF, function(codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{ 0x18D00, 0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
	{ 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{ 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{ 0x2A700, 0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	{ 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{ 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	{ 0x2EBF0, 0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{ 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{ 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{ 0x31350, 0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
	{ 0xE0100, 0xE01EF, function(codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end },
	{ 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use
}
-- Stored so that binary_range_search does not have to compute the length.
name_hooks.length = #name_hooks

-- The most recently matched entry of name_hooks, if any.
local name_range_cache

-- Produces a name from a name hook's third field, which is either a format
-- string or a function taking the code point.
local function generate_name(data, codepoint)
	if type(data) == "string" then
		return data:format(codepoint)
	else
		return data(codepoint)
	end
end

--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Codepoint %04X out of range", codepoint)
	end
end
--]]

-- Returns true if the code point is a noncharacter.
function p.is_noncharacter(codepoint)
	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
		or floor(codepoint % 0x10000) >= 0xFFFE)
end

-- Returns the name or code point label of a code point.
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
	if p.is_noncharacter(codepoint) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	if name_range_cache -- Check if previously used "name hook" applies to this code point.
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	local range = binary_range_search(codepoint, name_hooks)
	if range then
		name_range_cache = range
		return generate_name(range[3], codepoint)
	end

	-- Names are split over subpages by the upper digits of the code point;
	-- floor the quotient so the format never receives a fractional value.
	local data = loader[('names/%03X'):format(floor(codepoint / 0x1000))]

	if data and data[codepoint] then
		return data[codepoint]

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end

-- Returns the entry for the code point in the matching "images" data subpage,
-- or nothing if that subpage could not be loaded.
function p.lookup_image(codepoint)
	local data = loader[('images/%03X'):format(floor(codepoint / 0x1000))]

	if data then
		return data[codepoint]
	end
end

-- Names of the planes that have one, keyed by plane number.
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[ 3] = "Tertiary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}

-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
local blocks

-- Stateless iterator step used by p.enum_blocks.
local function block_iter(blocks, i)
	i = i + 1
	local data = blocks[i]
	if data then
		-- Unpack doesn't work on tables loaded with mw.loadData.
		return i, data[1], data[2], data[3]
	end
end

-- An ipairs-type iterator generator for the list of blocks.
function p.enum_blocks()
	local blocks = loader.blocks
	return block_iter, blocks, 0
end

-- Returns the name of the plane containing the code point, or "Plane N" for
-- planes without an entry in "planes".
function p.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %u"):format(i)
end

-- Returns the third field of the block entry containing the code point, or
-- "No Block" if no entry contains it.
function p.lookup_block(codepoint)
	local blocks = loader.blocks
	local range = binary_range_search(codepoint, blocks)
	if range then
		return range[3]
	else
		return "No Block"
	end
end

-- Returns the block entry whose third field equals "name", or nothing if
-- there is none.
function p.get_block_info(name)
	for i, block in ipairs(loader.blocks) do
		if block[3] == name then
			return block
		end
	end
end

-- Returns true if "pagename" contains none of the excluded code points listed
-- below, only printable code points, and at least one code point that is not
-- a space separator.
function p.is_valid_pagename(pagename)
	local has_nonws = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if (cp == 0x0023) -- #
		or (cp == 0x005B) -- [
		or (cp == 0x005D) -- ]
		or (cp == 0x007B) -- {
		or (cp == 0x007C) -- |
		or (cp == 0x007D) -- }
		or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
		or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
		or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
		then
			return false
		end

		local printable, result = p.is_printable(cp)
		if not printable then
			return false
		end

		if result ~= "space-separator" then
			has_nonws = true
		end
	end

	return has_nonws
end

-- Returns the elements of the array "what" from index "from" (default 1)
-- onwards as multiple values. Needed because unpack doesn't work on tables
-- loaded with mw.loadData.
local function manual_unpack(what, from)
	-- Apply the default before "from" is used in arithmetic.
	from = from or 1
	if what[from + 1] == nil then
		return what[from]
	end

	local result = {}
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end

-- Orders ranges by their first code point.
local function compare_ranges(range1, range2)
	return range1[1] < range2[1]
end

-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "data_module_subpage" is the subpage name handed to "loader", which loads the
-- module holding the "singles" and "ranges" tables.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function(codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			singles, ranges = data_module.singles, data_module.ranges
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		local cached = binary_range_search(codepoint, cache)
		if cached then
			return match_func(codepoint, manual_unpack(cached, 3))
		end

		local range, index = binary_range_search(codepoint, ranges)
		if range then
			table.insert(cache, range)
			table.sort(cache, compare_ranges)
			return match_func(codepoint, manual_unpack(range, 3))
		end

		-- No match: "index" is the last range probed, which borders the gap
		-- containing the code point. Remember the whole gap with the default
		-- data so later look-ups in it are answered from the cache.
		if ranges[index] then
			local dots_range
			if codepoint > ranges[index][2] then
				dots_range = {
					ranges[index][2] + 1,
					ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
					unpack(dots)
				}
			else -- codepoint < ranges[index][1]
				dots_range = {
					ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
					ranges[index][1] - 1,
					unpack(dots)
				}
			end
			table.insert(cache, dots_range)
			table.sort(cache, compare_ranges)
		end

		-- Pass the defaults so the first look-up in a gap gives the same
		-- result as later look-ups answered from the cached gap range.
		return match_func(codepoint, unpack(dots))
	end
end

-- Get a code point's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
	"combining",
	function(codepoint, combining_class)
		return combining_class and combining_class ~= 0 or false
	end,
	0)

-- Returns "str" with a dotted circle inserted before every combining character.
function p.add_dotted_circle(str)
	return (mw.ustring.gsub(str, ".",
		function(char)
			if p.is_combining(mw.ustring.codepoint(char)) then
				return '◌' .. char
			end
		end))
end

-- Looks up a code point in [[Module:Unicode data/control]]; the default is
-- "assigned".
local lookup_control = memo_lookup(
	"control",
	function(codepoint, ccc)
		return ccc or "assigned"
	end,
	"assigned")
p.lookup_control = lookup_control

-- Returns true unless the control look-up yields "unassigned".
function p.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end

-- Returns whether the control look-up yields "assigned" or "space-separator",
-- followed by the look-up result itself.
function p.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end

-- Returns whether the control look-up yields "space-separator", followed by
-- the look-up result itself.
function p.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end

-- Looks up a code point in [[Module:Unicode data/category]]; the default is
-- "Cn".
p.lookup_category = memo_lookup(
	"category",
	function(codepoint, category)
		return category
	end,
	"Cn")

-- Looks up a code point in [[Module:Unicode data/scripts]]; the default is
-- "Zzzz".
local lookup_script = memo_lookup(
	"scripts",
	function(codepoint, script_code)
		return script_code or 'Zzzz'
	end,
	"Zzzz")
p.lookup_script = lookup_script

-- Returns the script code of "str" when exactly one script other than Zyyy,
-- Zinh and Zzzz occurs in it. Returns nothing when no such script or more
-- than one such script occurs.
function p.get_best_script(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Convert HTML character references (including named character references,
	-- or character entities) to characters.
	str = mw.text.decode(str, true)

	local scripts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
			scripts[script] = true
		end
	end

	-- If scripts does not contain two or more keys,
	-- return first and only key (script code) in table.
	if not next(scripts, next(scripts)) then
		return next(scripts)
	end

	-- else return majority script, or else "Zzzz"?
end

-- Returns true if "str" contains at least one Latin-script character and
-- otherwise only characters of the ignorable scripts (Zyyy, Zinh, Zzzz).
-- When the result is false, the second return value is a character position:
-- that of the first character from another script, or the length of the
-- string if there was none.
function p.is_Latin(str)
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
	end

	local Latn = false
	local i = 0 -- indexer for use in error messages

	for codepoint in mw.ustring.gcodepoint(str) do
		i = i + 1 -- bump the indexer
		local script = lookup_script(codepoint)

		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false, i -- abandon as not Latn; identify the offending character's position
		end
	end

	return Latn, (not Latn and i) or nil -- when <Latn> false, return offending character's position as second return value; nil else
end

-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts.
function p.is_rtl(str)
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end

	local result = false
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return result
end

--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------

external entry from an {{#invoke:}} to determine if a string of text is rtl.  Strips html and html-like tags so
that those tags don't corrupt the is-rtl-is-not-rtl determination; this added for the cases where the rtl text
has <br /> tags.

]]

function p.is_rtl_frame(frame)
	local str = frame.args[1] -- get the string from the {{#invoke:}} frame
	str = str:gsub('%b<>', '') -- strip any html and html-like tags
	return p.is_rtl(str) -- return if whatever remains rtl; false else
end

-- Reads parameter "arg" from "args" as a hexadecimal code point.
-- Raises an error if the parameter is missing, is not hexadecimal, or lies
-- outside 0-0x10FFFF.
local function get_codepoint(args, arg)
	local codepoint_string = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	local codepoint = tonumber(codepoint_string, 16)
		or errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	if not (0 <= codepoint and codepoint <= 0x10FFFF) then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return codepoint
end

-- Returns the function p[prefix .. suffix], where the suffix is the trimmed
-- value of parameter "arg". Raises an error if the parameter is missing or no
-- such function exists.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	suffix = mw.text.trim(suffix)
	local func_name = prefix .. suffix
	local func = p[func_name]
		or errorf(2, "There is no function '%s'", func_name)
	return func
end

-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
	local func = get_func(frame.args, 1, "lookup_")
	local codepoint = get_codepoint(frame.args, 2)
	local result = func(codepoint)
	if func == p.lookup_name then
		-- Prevent code point labels such as <control-0000> from being
		-- interpreted as HTML tags.
		result = result:gsub("<", "&lt;")
	end
	return result
end

-- This function allows any of the "is" functions to be invoked. The first
-- parameter is the word after "is_"; the second parameter is the string or
-- the code point in hexadecimal base, depending on the function.
function p.is(frame)
	local func = get_func(frame.args, 1, "is_")

	-- is_Latin, is_valid_pagename and is_rtl take strings.
	if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then
		return (func(frame.args[2]))
	else -- The rest take code points.
		local codepoint = get_codepoint(frame.args, 2)
		return (func(codepoint)) -- Adjust to one result.
	end
end

-- Returns the entry for the code point in the matching "Unihan/kCantonese"
-- data subpage, or nothing if that subpage could not be loaded.
function p.lookup_kCantonese(codepoint)
	local data = loader[('Unihan/kCantonese/%02X'):format(floor(codepoint / 0x1000))]

	if data then
		return data[codepoint]
	end
end

return p
close